Created
December 12, 2022 18:30
-
-
Save pashu123/c9c01d1397a820dad48ecdd01e9ae5d5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -----// IR Dump Before IREEImportPublic (iree-import-public) //----- // | |
#loc2 = loc("/home/prashant/test.mlir":3:22) | |
#loc3 = loc("/home/prashant/test.mlir":3:53) | |
#loc10 = loc("/home/prashant/test.mlir":10:10) | |
#loc11 = loc("/home/prashant/test.mlir":10:20) | |
#loc15 = loc("/home/prashant/test.mlir":15:10) | |
#loc16 = loc("/home/prashant/test.mlir":15:20) | |
#loc17 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
module attributes {torch.debug_module_name = "_lambda"} { | |
func.func @forward(%arg0: tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:22), %arg1: tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:53)) -> tensor<10x4096x4096xf16> { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc4) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc5) | |
%0 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc6) | |
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc7) | |
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%1 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc8) | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<10x4096x4096xf16>) outs(%0 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%5 = arith.mulf %in, %cst_0 : f16 loc(#loc12) | |
linalg.yield %5 : f16 loc(#loc13) | |
} -> tensor<10x4096x4096xf16> loc(#loc9) | |
%4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3, %0 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%0 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%5 = arith.mulf %in_1, %cst : f16 loc(#loc18) | |
%6 = arith.addf %in, %5 : f16 loc(#loc19) | |
linalg.yield %6 : f16 loc(#loc20) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
return %4 : tensor<10x4096x4096xf16> loc(#loc21) | |
} loc(#loc1) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc1 = loc("/home/prashant/test.mlir":3:3) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":5:14) | |
#loc6 = loc("/home/prashant/test.mlir":6:10) | |
#loc7 = loc("/home/prashant/test.mlir":7:10) | |
#loc8 = loc("/home/prashant/test.mlir":8:10) | |
#loc9 = loc("/home/prashant/test.mlir":9:10) | |
#loc12 = loc("/home/prashant/test.mlir":11:12) | |
#loc13 = loc("/home/prashant/test.mlir":12:7) | |
#loc14 = loc("/home/prashant/test.mlir":14:10) | |
#loc18 = loc("/home/prashant/test.mlir":16:12) | |
#loc19 = loc("/home/prashant/test.mlir":17:12) | |
#loc20 = loc("/home/prashant/test.mlir":18:7) | |
#loc21 = loc("/home/prashant/test.mlir":20:5) | |
// -----// IR Dump Before ImportMLProgram (iree-import-ml-program) //----- // | |
#loc2 = loc("/home/prashant/test.mlir":3:22) | |
#loc3 = loc("/home/prashant/test.mlir":3:53) | |
#loc10 = loc("/home/prashant/test.mlir":10:10) | |
#loc11 = loc("/home/prashant/test.mlir":10:20) | |
#loc15 = loc("/home/prashant/test.mlir":15:10) | |
#loc16 = loc("/home/prashant/test.mlir":15:20) | |
#loc17 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
module attributes {torch.debug_module_name = "_lambda"} { | |
func.func @forward(%arg0: tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:22), %arg1: tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:53)) -> tensor<10x4096x4096xf16> { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc4) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc5) | |
%0 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc6) | |
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc7) | |
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%1 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc8) | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<10x4096x4096xf16>) outs(%0 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%5 = arith.mulf %in, %cst_0 : f16 loc(#loc12) | |
linalg.yield %5 : f16 loc(#loc13) | |
} -> tensor<10x4096x4096xf16> loc(#loc9) | |
%4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3, %0 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%0 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%5 = arith.mulf %in_1, %cst : f16 loc(#loc18) | |
%6 = arith.addf %in, %5 : f16 loc(#loc19) | |
linalg.yield %6 : f16 loc(#loc20) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
return %4 : tensor<10x4096x4096xf16> loc(#loc21) | |
} loc(#loc1) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc1 = loc("/home/prashant/test.mlir":3:3) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":5:14) | |
#loc6 = loc("/home/prashant/test.mlir":6:10) | |
#loc7 = loc("/home/prashant/test.mlir":7:10) | |
#loc8 = loc("/home/prashant/test.mlir":8:10) | |
#loc9 = loc("/home/prashant/test.mlir":9:10) | |
#loc12 = loc("/home/prashant/test.mlir":11:12) | |
#loc13 = loc("/home/prashant/test.mlir":12:7) | |
#loc14 = loc("/home/prashant/test.mlir":14:10) | |
#loc18 = loc("/home/prashant/test.mlir":16:12) | |
#loc19 = loc("/home/prashant/test.mlir":17:12) | |
#loc20 = loc("/home/prashant/test.mlir":18:7) | |
#loc21 = loc("/home/prashant/test.mlir":20:5) | |
// -----// IR Dump Before SanitizeModuleNames (iree-sanitize-module-names) //----- // | |
#loc2 = loc("/home/prashant/test.mlir":3:22) | |
#loc3 = loc("/home/prashant/test.mlir":3:53) | |
#loc10 = loc("/home/prashant/test.mlir":10:10) | |
#loc11 = loc("/home/prashant/test.mlir":10:20) | |
#loc15 = loc("/home/prashant/test.mlir":15:10) | |
#loc16 = loc("/home/prashant/test.mlir":15:20) | |
#loc17 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
module attributes {torch.debug_module_name = "_lambda"} { | |
func.func @forward(%arg0: tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:22), %arg1: tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:53)) -> tensor<10x4096x4096xf16> { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc4) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc5) | |
%0 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc6) | |
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc7) | |
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%1 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc8) | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<10x4096x4096xf16>) outs(%0 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%5 = arith.mulf %in, %cst_0 : f16 loc(#loc12) | |
linalg.yield %5 : f16 loc(#loc13) | |
} -> tensor<10x4096x4096xf16> loc(#loc9) | |
%4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3, %0 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%0 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%5 = arith.mulf %in_1, %cst : f16 loc(#loc18) | |
%6 = arith.addf %in, %5 : f16 loc(#loc19) | |
linalg.yield %6 : f16 loc(#loc20) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
return %4 : tensor<10x4096x4096xf16> loc(#loc21) | |
} loc(#loc1) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc1 = loc("/home/prashant/test.mlir":3:3) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":5:14) | |
#loc6 = loc("/home/prashant/test.mlir":6:10) | |
#loc7 = loc("/home/prashant/test.mlir":7:10) | |
#loc8 = loc("/home/prashant/test.mlir":8:10) | |
#loc9 = loc("/home/prashant/test.mlir":9:10) | |
#loc12 = loc("/home/prashant/test.mlir":11:12) | |
#loc13 = loc("/home/prashant/test.mlir":12:7) | |
#loc14 = loc("/home/prashant/test.mlir":14:10) | |
#loc18 = loc("/home/prashant/test.mlir":16:12) | |
#loc19 = loc("/home/prashant/test.mlir":17:12) | |
#loc20 = loc("/home/prashant/test.mlir":18:7) | |
#loc21 = loc("/home/prashant/test.mlir":20:5) | |
// -----// IR Dump Before mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- // | |
#loc2 = loc("/home/prashant/test.mlir":3:22) | |
#loc3 = loc("/home/prashant/test.mlir":3:53) | |
#loc10 = loc("/home/prashant/test.mlir":10:10) | |
#loc11 = loc("/home/prashant/test.mlir":10:20) | |
#loc15 = loc("/home/prashant/test.mlir":15:10) | |
#loc16 = loc("/home/prashant/test.mlir":15:20) | |
#loc17 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
module attributes {torch.debug_module_name = "_lambda"} { | |
func.func @forward(%arg0: tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:22), %arg1: tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:53)) -> tensor<10x4096x4096xf16> { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc4) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc5) | |
%0 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc6) | |
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc7) | |
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%1 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc8) | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<10x4096x4096xf16>) outs(%0 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%5 = arith.mulf %in, %cst_0 : f16 loc(#loc12) | |
linalg.yield %5 : f16 loc(#loc13) | |
} -> tensor<10x4096x4096xf16> loc(#loc9) | |
%4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3, %0 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%0 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%5 = arith.mulf %in_1, %cst : f16 loc(#loc18) | |
%6 = arith.addf %in, %5 : f16 loc(#loc19) | |
linalg.yield %6 : f16 loc(#loc20) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
return %4 : tensor<10x4096x4096xf16> loc(#loc21) | |
} loc(#loc1) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc1 = loc("/home/prashant/test.mlir":3:3) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":5:14) | |
#loc6 = loc("/home/prashant/test.mlir":6:10) | |
#loc7 = loc("/home/prashant/test.mlir":7:10) | |
#loc8 = loc("/home/prashant/test.mlir":8:10) | |
#loc9 = loc("/home/prashant/test.mlir":9:10) | |
#loc12 = loc("/home/prashant/test.mlir":11:12) | |
#loc13 = loc("/home/prashant/test.mlir":12:7) | |
#loc14 = loc("/home/prashant/test.mlir":14:10) | |
#loc18 = loc("/home/prashant/test.mlir":16:12) | |
#loc19 = loc("/home/prashant/test.mlir":17:12) | |
#loc20 = loc("/home/prashant/test.mlir":18:7) | |
#loc21 = loc("/home/prashant/test.mlir":20:5) | |
// -----// IR Dump Before Inliner (inline) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":3:3) | |
#loc2 = loc("/home/prashant/test.mlir":3:22) | |
#loc3 = loc("/home/prashant/test.mlir":3:53) | |
#loc10 = loc("/home/prashant/test.mlir":10:10) | |
#loc11 = loc("/home/prashant/test.mlir":10:20) | |
#loc15 = loc("/home/prashant/test.mlir":15:10) | |
#loc16 = loc("/home/prashant/test.mlir":15:20) | |
#loc17 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
module attributes {torch.debug_module_name = "_lambda"} { | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc(#loc1) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc(#loc1) | |
%2 = call @_forward(%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc1) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc(#loc1) | |
return %3 : !hal.buffer_view loc(#loc1) | |
} loc(#loc1) | |
func.func private @_forward(%arg0: tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:22), %arg1: tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:53)) -> tensor<10x4096x4096xf16> { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc4) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc5) | |
%0 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc6) | |
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc7) | |
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%1 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc8) | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<10x4096x4096xf16>) outs(%0 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%5 = arith.mulf %in, %cst_0 : f16 loc(#loc12) | |
linalg.yield %5 : f16 loc(#loc13) | |
} -> tensor<10x4096x4096xf16> loc(#loc9) | |
%4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3, %0 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%0 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%5 = arith.mulf %in_1, %cst : f16 loc(#loc18) | |
%6 = arith.addf %in, %5 : f16 loc(#loc19) | |
linalg.yield %6 : f16 loc(#loc20) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
return %4 : tensor<10x4096x4096xf16> loc(#loc21) | |
} loc(#loc1) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":5:14) | |
#loc6 = loc("/home/prashant/test.mlir":6:10) | |
#loc7 = loc("/home/prashant/test.mlir":7:10) | |
#loc8 = loc("/home/prashant/test.mlir":8:10) | |
#loc9 = loc("/home/prashant/test.mlir":9:10) | |
#loc12 = loc("/home/prashant/test.mlir":11:12) | |
#loc13 = loc("/home/prashant/test.mlir":12:7) | |
#loc14 = loc("/home/prashant/test.mlir":14:10) | |
#loc18 = loc("/home/prashant/test.mlir":16:12) | |
#loc19 = loc("/home/prashant/test.mlir":17:12) | |
#loc20 = loc("/home/prashant/test.mlir":18:7) | |
#loc21 = loc("/home/prashant/test.mlir":20:5) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func private @_forward(%arg0: tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:22), %arg1: tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:53)) -> tensor<10x4096x4096xf16> { | |
%cst = arith.constant 0.000000e+00 : f16 loc("/home/prashant/test.mlir":4:12) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc("/home/prashant/test.mlir":5:14) | |
%0 = tensor.empty() : tensor<10x4096x4096xf16> loc("/home/prashant/test.mlir":6:10) | |
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc("/home/prashant/test.mlir":7:10) | |
%2 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%1 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc("/home/prashant/test.mlir":8:10) | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<10x4096x4096xf16>) outs(%0 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%5 = arith.mulf %in, %cst_0 : f16 loc("/home/prashant/test.mlir":11:12) | |
linalg.yield %5 : f16 loc("/home/prashant/test.mlir":12:7) | |
} -> tensor<10x4096x4096xf16> loc("/home/prashant/test.mlir":9:10) | |
%4 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3, %0 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%0 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%5 = arith.mulf %in_1, %cst : f16 loc("/home/prashant/test.mlir":16:12) | |
%6 = arith.addf %in, %5 : f16 loc("/home/prashant/test.mlir":17:12) | |
linalg.yield %6 : f16 loc("/home/prashant/test.mlir":18:7) | |
} -> tensor<10x4096x4096xf16> loc("/home/prashant/test.mlir":14:10) | |
return %4 : tensor<10x4096x4096xf16> loc("/home/prashant/test.mlir":20:5) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = call @_forward(%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %3 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%cst = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%8 = arith.mulf %in, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %8 : f16 loc(callsite("/home/prashant/test.mlir":12:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":9:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%8 = arith.mulf %in_1, %cst : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%9 = arith.addf %in, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":18:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %7 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%8 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %8 : f16 loc(callsite("/home/prashant/test.mlir":12:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":9:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%9 = arith.addf %in, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":18:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %7 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before CSE (cse) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%8 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %8 : f16 loc(callsite("/home/prashant/test.mlir":12:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":9:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%9 = arith.addf %in, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":18:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %7 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before SymbolDCE (symbol-dce) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":10:20) | |
#loc13 = loc("/home/prashant/test.mlir":15:10) | |
#loc14 = loc("/home/prashant/test.mlir":15:20) | |
#loc15 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
module attributes {torch.debug_module_name = "_lambda"} { | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc19) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc20) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc(#loc1) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc(#loc1) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc21) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc22) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc23) | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%8 = arith.mulf %in, %cst : f16 loc(#loc25) | |
linalg.yield %8 : f16 loc(#loc26) | |
} -> tensor<10x4096x4096xf16> loc(#loc24) | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(#loc28) | |
%9 = arith.addf %in, %8 : f16 loc(#loc29) | |
linalg.yield %9 : f16 loc(#loc30) | |
} -> tensor<10x4096x4096xf16> loc(#loc27) | |
%7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc(#loc1) | |
return %7 : !hal.buffer_view loc(#loc1) | |
} loc(#loc1) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc2 = loc("/home/prashant/test.mlir":5:14) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":6:10) | |
#loc5 = loc("/home/prashant/test.mlir":7:10) | |
#loc6 = loc("/home/prashant/test.mlir":8:10) | |
#loc7 = loc("/home/prashant/test.mlir":9:10) | |
#loc10 = loc("/home/prashant/test.mlir":11:12) | |
#loc11 = loc("/home/prashant/test.mlir":12:7) | |
#loc12 = loc("/home/prashant/test.mlir":14:10) | |
#loc16 = loc("/home/prashant/test.mlir":16:12) | |
#loc17 = loc("/home/prashant/test.mlir":17:12) | |
#loc18 = loc("/home/prashant/test.mlir":18:7) | |
#loc19 = loc(callsite(#loc2 at #loc1)) | |
#loc20 = loc(callsite(#loc3 at #loc1)) | |
#loc21 = loc(callsite(#loc4 at #loc1)) | |
#loc22 = loc(callsite(#loc5 at #loc1)) | |
#loc23 = loc(callsite(#loc6 at #loc1)) | |
#loc24 = loc(callsite(#loc7 at #loc1)) | |
#loc25 = loc(callsite(#loc10 at #loc1)) | |
#loc26 = loc(callsite(#loc11 at #loc1)) | |
#loc27 = loc(callsite(#loc12 at #loc1)) | |
#loc28 = loc(callsite(#loc16 at #loc1)) | |
#loc29 = loc(callsite(#loc17 at #loc1)) | |
#loc30 = loc(callsite(#loc18 at #loc1)) | |
// -----// IR Dump Before DemoteF64ToF32 (iree-util-demote-f64-to-f32) //----- // | |
// Location aliases: each #locN names a line:column position in
// /home/prashant/test.mlir so operations below can refer to it compactly.
#loc1 = loc("/home/prashant/test.mlir":3:3)
#loc8 = loc("/home/prashant/test.mlir":10:10)
#loc9 = loc("/home/prashant/test.mlir":10:20)
#loc13 = loc("/home/prashant/test.mlir":15:10)
#loc14 = loc("/home/prashant/test.mlir":15:20)
#loc15 = loc("/home/prashant/test.mlir":15:32)
// Identity indexing map over three dimensions; used by both linalg.generic ops.
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
module attributes {torch.debug_module_name = "_lambda"} {
  // @forward takes two !hal.buffer_view arguments and returns one.
  // Per element it computes batch_matmul(%0, %1) * 0.125 + %2 * 0.0.
  func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} {
    // f16 scalars: %cst = 0.125 (scale factor), %cst_0 = 0.0 (fill value and
    // multiplier applied to the second generic's %in_1).
    %cst = arith.constant 1.250000e-01 : f16 loc(#loc19)
    %cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc20)
    // Import both buffer views as statically shaped f16 tensors.
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc(#loc1)
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc(#loc1)
    // %2 is the 10x4096x4096 tensor.empty result; %3 is %2 filled with 0.0.
    %2 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc21)
    %3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc22)
    // Batched matmul of %0 (10x4096x64) by %1 (10x64x4096); outs is the zero-filled %3.
    %4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc23)
    // Element-wise over %4: yields %in * %cst (0.125). %out is not read.
    %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) {
    ^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)):
      %8 = arith.mulf %in, %cst : f16 loc(#loc25)
      linalg.yield %8 : f16 loc(#loc26)
    } -> tensor<10x4096x4096xf16> loc(#loc24)
    // Element-wise over (%5, %2): yields %in + %in_1 * %cst_0 (0.0).
    // NOTE(review): %in_1 is read from %2, the tensor.empty result, not from the
    // zero-filled %3. If those elements are not finite, 0.0 * x is NaN - confirm
    // this is intended.
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) {
    ^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)):
      %8 = arith.mulf %in_1, %cst_0 : f16 loc(#loc28)
      %9 = arith.addf %in, %8 : f16 loc(#loc29)
      linalg.yield %9 : f16 loc(#loc30)
    } -> tensor<10x4096x4096xf16> loc(#loc27)
    // Export the result tensor as a !hal.buffer_view and return it.
    %7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc(#loc1)
    return %7 : !hal.buffer_view loc(#loc1)
  } loc(#loc1)
} loc(#loc)
// Remaining location aliases. #loc19-#loc30 each pair one source position with
// #loc1 in the form callsite(<position> at #loc1).
#loc = loc("/home/prashant/test.mlir":2:1)
#loc2 = loc("/home/prashant/test.mlir":5:14)
#loc3 = loc("/home/prashant/test.mlir":4:12)
#loc4 = loc("/home/prashant/test.mlir":6:10)
#loc5 = loc("/home/prashant/test.mlir":7:10)
#loc6 = loc("/home/prashant/test.mlir":8:10)
#loc7 = loc("/home/prashant/test.mlir":9:10)
#loc10 = loc("/home/prashant/test.mlir":11:12)
#loc11 = loc("/home/prashant/test.mlir":12:7)
#loc12 = loc("/home/prashant/test.mlir":14:10)
#loc16 = loc("/home/prashant/test.mlir":16:12)
#loc17 = loc("/home/prashant/test.mlir":17:12)
#loc18 = loc("/home/prashant/test.mlir":18:7)
#loc19 = loc(callsite(#loc2 at #loc1))
#loc20 = loc(callsite(#loc3 at #loc1))
#loc21 = loc(callsite(#loc4 at #loc1))
#loc22 = loc(callsite(#loc5 at #loc1))
#loc23 = loc(callsite(#loc6 at #loc1))
#loc24 = loc(callsite(#loc7 at #loc1))
#loc25 = loc(callsite(#loc10 at #loc1))
#loc26 = loc(callsite(#loc11 at #loc1))
#loc27 = loc(callsite(#loc12 at #loc1))
#loc28 = loc(callsite(#loc16 at #loc1))
#loc29 = loc(callsite(#loc17 at #loc1))
#loc30 = loc(callsite(#loc18 at #loc1))
// -----// IR Dump Before DetachElementwiseFromNamedOps (iree-flow-detach-elementwise-from-named-ops) //----- // | |
// @forward takes two !hal.buffer_view arguments and returns one.
// Per element it computes batch_matmul(%0, %1) * 0.125 + %2 * 0.0.
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} {
  // f16 scalars: %cst = 0.125 (scale factor), %cst_0 = 0.0 (fill value and
  // multiplier applied to the second generic's %in_1).
  %cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3))
  %cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3))
  // Import both buffer views as statically shaped f16 tensors.
  %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3)
  %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3)
  // %2 is the 10x4096x4096 tensor.empty result; %3 is %2 filled with 0.0.
  %2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3))
  %3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3))
  // Batched matmul of %0 (10x4096x64) by %1 (10x64x4096); outs is the zero-filled %3.
  %4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3))
  // Element-wise over %4: yields %in * %cst (0.125). %out is not read.
  %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) {
  ^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)):
    %8 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3))
    linalg.yield %8 : f16 loc(callsite("/home/prashant/test.mlir":12:7 at "/home/prashant/test.mlir":3:3))
  } -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":9:10 at "/home/prashant/test.mlir":3:3))
  // Element-wise over (%5, %2): yields %in + %in_1 * %cst_0 (0.0).
  // NOTE(review): %in_1 is read from %2, the tensor.empty result, not from the
  // zero-filled %3. If those elements are not finite, 0.0 * x is NaN - confirm
  // this is intended.
  %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) {
  ^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)):
    %8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3))
    %9 = arith.addf %in, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3))
    linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":18:7 at "/home/prashant/test.mlir":3:3))
  } -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3))
  // Export the result tensor as a !hal.buffer_view and return it.
  %7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3)
  return %7 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3)
} loc("/home/prashant/test.mlir":3:3)
// -----// IR Dump Before LinalgNamedOpConversion (linalg-named-op-conversion) //----- // | |
// @forward takes two !hal.buffer_view arguments and returns one.
// Per element it computes batch_matmul(%0, %1) * 0.125 + %2 * 0.0.
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} {
  // f16 scalars: %cst = 0.125 (scale factor), %cst_0 = 0.0 (fill value and
  // multiplier applied to the second generic's %in_1).
  %cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3))
  %cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3))
  // Import both buffer views as statically shaped f16 tensors.
  %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3)
  %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3)
  // %2 is the 10x4096x4096 tensor.empty result; %3 is %2 filled with 0.0.
  %2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3))
  %3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3))
  // Batched matmul of %0 (10x4096x64) by %1 (10x64x4096); outs is the zero-filled %3.
  %4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3))
  // Element-wise over %4: yields %in * %cst (0.125). %out is not read.
  %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) {
  ^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)):
    %8 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3))
    linalg.yield %8 : f16 loc(callsite("/home/prashant/test.mlir":12:7 at "/home/prashant/test.mlir":3:3))
  } -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":9:10 at "/home/prashant/test.mlir":3:3))
  // Element-wise over (%5, %2): yields %in + %in_1 * %cst_0 (0.0).
  // NOTE(review): %in_1 is read from %2, the tensor.empty result, not from the
  // zero-filled %3. If those elements are not finite, 0.0 * x is NaN - confirm
  // this is intended.
  %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) {
  ^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)):
    %8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3))
    %9 = arith.addf %in, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3))
    linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":18:7 at "/home/prashant/test.mlir":3:3))
  } -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3))
  // Export the result tensor as a !hal.buffer_view and return it.
  %7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3)
  return %7 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3)
} loc("/home/prashant/test.mlir":3:3)
// -----// IR Dump Before Convert1X1FilterConv2DToMatmul (iree-flow-convert-1x1-filter-conv2d-to-matmul) //----- // | |
// @forward takes two !hal.buffer_view arguments and returns one.
// Per element it computes batch_matmul(%0, %1) * 0.125 + %2 * 0.0.
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} {
  // f16 scalars: %cst = 0.125 (scale factor), %cst_0 = 0.0 (fill value and
  // multiplier applied to the second generic's %in_1).
  %cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3))
  %cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3))
  // Import both buffer views as statically shaped f16 tensors.
  %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3)
  %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3)
  // %2 is the 10x4096x4096 tensor.empty result; %3 is %2 filled with 0.0.
  %2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3))
  %3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3))
  // Batched matmul of %0 (10x4096x64) by %1 (10x64x4096); outs is the zero-filled %3.
  %4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3))
  // Element-wise over %4: yields %in * %cst (0.125). %out is not read.
  %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) {
  ^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)):
    %8 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3))
    linalg.yield %8 : f16 loc(callsite("/home/prashant/test.mlir":12:7 at "/home/prashant/test.mlir":3:3))
  } -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":9:10 at "/home/prashant/test.mlir":3:3))
  // Element-wise over (%5, %2): yields %in + %in_1 * %cst_0 (0.0).
  // NOTE(review): %in_1 is read from %2, the tensor.empty result, not from the
  // zero-filled %3. If those elements are not finite, 0.0 * x is NaN - confirm
  // this is intended.
  %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) {
  ^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)):
    %8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3))
    %9 = arith.addf %in, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3))
    linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":18:7 at "/home/prashant/test.mlir":3:3))
  } -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3))
  // Export the result tensor as a !hal.buffer_view and return it.
  %7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3)
  return %7 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3)
} loc("/home/prashant/test.mlir":3:3)
// -----// IR Dump Before ConvertConv2DToImg2Col (iree-flow-convert-conv2d-to-img2col) //----- // | |
// @forward takes two !hal.buffer_view arguments and returns one.
// Per element it computes batch_matmul(%0, %1) * 0.125 + %2 * 0.0.
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} {
  // f16 scalars: %cst = 0.125 (scale factor), %cst_0 = 0.0 (fill value and
  // multiplier applied to the second generic's %in_1).
  %cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3))
  %cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3))
  // Import both buffer views as statically shaped f16 tensors.
  %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3)
  %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3)
  // %2 is the 10x4096x4096 tensor.empty result; %3 is %2 filled with 0.0.
  %2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3))
  %3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3))
  // Batched matmul of %0 (10x4096x64) by %1 (10x64x4096); outs is the zero-filled %3.
  %4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3))
  // Element-wise over %4: yields %in * %cst (0.125). %out is not read.
  %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) {
  ^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)):
    %8 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3))
    linalg.yield %8 : f16 loc(callsite("/home/prashant/test.mlir":12:7 at "/home/prashant/test.mlir":3:3))
  } -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":9:10 at "/home/prashant/test.mlir":3:3))
  // Element-wise over (%5, %2): yields %in + %in_1 * %cst_0 (0.0).
  // NOTE(review): %in_1 is read from %2, the tensor.empty result, not from the
  // zero-filled %3. If those elements are not finite, 0.0 * x is NaN - confirm
  // this is intended.
  %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) {
  ^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)):
    %8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3))
    %9 = arith.addf %in, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3))
    linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":18:7 at "/home/prashant/test.mlir":3:3))
  } -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3))
  // Export the result tensor as a !hal.buffer_view and return it.
  %7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3)
  return %7 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3)
} loc("/home/prashant/test.mlir":3:3)
// -----// IR Dump Before PadLinalgOps (iree-flow-pad-linalg-ops) //----- // | |
// @forward takes two !hal.buffer_view arguments and returns one.
// Per element it computes batch_matmul(%0, %1) * 0.125 + %2 * 0.0.
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} {
  // f16 scalars: %cst = 0.125 (scale factor), %cst_0 = 0.0 (fill value and
  // multiplier applied to the second generic's %in_1).
  %cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3))
  %cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3))
  // Import both buffer views as statically shaped f16 tensors.
  %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3)
  %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3)
  // %2 is the 10x4096x4096 tensor.empty result; %3 is %2 filled with 0.0.
  %2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3))
  %3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3))
  // Batched matmul of %0 (10x4096x64) by %1 (10x64x4096); outs is the zero-filled %3.
  %4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3))
  // Element-wise over %4: yields %in * %cst (0.125). %out is not read.
  %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) {
  ^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)):
    %8 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3))
    linalg.yield %8 : f16 loc(callsite("/home/prashant/test.mlir":12:7 at "/home/prashant/test.mlir":3:3))
  } -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":9:10 at "/home/prashant/test.mlir":3:3))
  // Element-wise over (%5, %2): yields %in + %in_1 * %cst_0 (0.0).
  // NOTE(review): %in_1 is read from %2, the tensor.empty result, not from the
  // zero-filled %3. If those elements are not finite, 0.0 * x is NaN - confirm
  // this is intended.
  %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) {
  ^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)):
    %8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3))
    %9 = arith.addf %in, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3))
    linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":18:7 at "/home/prashant/test.mlir":3:3))
  } -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3))
  // Export the result tensor as a !hal.buffer_view and return it.
  %7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3)
  return %7 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3)
} loc("/home/prashant/test.mlir":3:3)
// -----// IR Dump Before VerifyInputLegality (iree-verify-input-legality) //----- // | |
// Location aliases: each #locN names a line:column position in
// /home/prashant/test.mlir so operations below can refer to it compactly.
#loc1 = loc("/home/prashant/test.mlir":3:3)
#loc8 = loc("/home/prashant/test.mlir":10:10)
#loc9 = loc("/home/prashant/test.mlir":10:20)
#loc13 = loc("/home/prashant/test.mlir":15:10)
#loc14 = loc("/home/prashant/test.mlir":15:20)
#loc15 = loc("/home/prashant/test.mlir":15:32)
// Identity indexing map over three dimensions; used by both linalg.generic ops.
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
module attributes {torch.debug_module_name = "_lambda"} {
  // @forward takes two !hal.buffer_view arguments and returns one.
  // Per element it computes batch_matmul(%0, %1) * 0.125 + %2 * 0.0.
  func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} {
    // f16 scalars: %cst = 0.125 (scale factor), %cst_0 = 0.0 (fill value and
    // multiplier applied to the second generic's %in_1).
    %cst = arith.constant 1.250000e-01 : f16 loc(#loc19)
    %cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc20)
    // Import both buffer views as statically shaped f16 tensors.
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc(#loc1)
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc(#loc1)
    // %2 is the 10x4096x4096 tensor.empty result; %3 is %2 filled with 0.0.
    %2 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc21)
    %3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc22)
    // Batched matmul of %0 (10x4096x64) by %1 (10x64x4096); outs is the zero-filled %3.
    %4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc23)
    // Element-wise over %4: yields %in * %cst (0.125). %out is not read.
    %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) {
    ^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)):
      %8 = arith.mulf %in, %cst : f16 loc(#loc25)
      linalg.yield %8 : f16 loc(#loc26)
    } -> tensor<10x4096x4096xf16> loc(#loc24)
    // Element-wise over (%5, %2): yields %in + %in_1 * %cst_0 (0.0).
    // NOTE(review): %in_1 is read from %2, the tensor.empty result, not from the
    // zero-filled %3. If those elements are not finite, 0.0 * x is NaN - confirm
    // this is intended.
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) {
    ^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)):
      %8 = arith.mulf %in_1, %cst_0 : f16 loc(#loc28)
      %9 = arith.addf %in, %8 : f16 loc(#loc29)
      linalg.yield %9 : f16 loc(#loc30)
    } -> tensor<10x4096x4096xf16> loc(#loc27)
    // Export the result tensor as a !hal.buffer_view and return it.
    %7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc(#loc1)
    return %7 : !hal.buffer_view loc(#loc1)
  } loc(#loc1)
} loc(#loc)
// Remaining location aliases. #loc19-#loc30 each pair one source position with
// #loc1 in the form callsite(<position> at #loc1).
#loc = loc("/home/prashant/test.mlir":2:1)
#loc2 = loc("/home/prashant/test.mlir":5:14)
#loc3 = loc("/home/prashant/test.mlir":4:12)
#loc4 = loc("/home/prashant/test.mlir":6:10)
#loc5 = loc("/home/prashant/test.mlir":7:10)
#loc6 = loc("/home/prashant/test.mlir":8:10)
#loc7 = loc("/home/prashant/test.mlir":9:10)
#loc10 = loc("/home/prashant/test.mlir":11:12)
#loc11 = loc("/home/prashant/test.mlir":12:7)
#loc12 = loc("/home/prashant/test.mlir":14:10)
#loc16 = loc("/home/prashant/test.mlir":16:12)
#loc17 = loc("/home/prashant/test.mlir":17:12)
#loc18 = loc("/home/prashant/test.mlir":18:7)
#loc19 = loc(callsite(#loc2 at #loc1))
#loc20 = loc(callsite(#loc3 at #loc1))
#loc21 = loc(callsite(#loc4 at #loc1))
#loc22 = loc(callsite(#loc5 at #loc1))
#loc23 = loc(callsite(#loc6 at #loc1))
#loc24 = loc(callsite(#loc7 at #loc1))
#loc25 = loc(callsite(#loc10 at #loc1))
#loc26 = loc(callsite(#loc11 at #loc1))
#loc27 = loc(callsite(#loc12 at #loc1))
#loc28 = loc(callsite(#loc16 at #loc1))
#loc29 = loc(callsite(#loc17 at #loc1))
#loc30 = loc(callsite(#loc18 at #loc1))
// -----// IR Dump Before ExpandTensorShapes (iree-flow-expand-tensor-shapes) //----- // | |
// Location aliases: each #locN names a line:column position in
// /home/prashant/test.mlir so operations below can refer to it compactly.
#loc1 = loc("/home/prashant/test.mlir":3:3)
#loc8 = loc("/home/prashant/test.mlir":10:10)
#loc9 = loc("/home/prashant/test.mlir":10:20)
#loc13 = loc("/home/prashant/test.mlir":15:10)
#loc14 = loc("/home/prashant/test.mlir":15:20)
#loc15 = loc("/home/prashant/test.mlir":15:32)
// Identity indexing map over three dimensions; used by both linalg.generic ops.
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
module attributes {torch.debug_module_name = "_lambda"} {
  // @forward takes two !hal.buffer_view arguments and returns one.
  // Per element it computes batch_matmul(%0, %1) * 0.125 + %2 * 0.0.
  func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} {
    // f16 scalars: %cst = 0.125 (scale factor), %cst_0 = 0.0 (fill value and
    // multiplier applied to the second generic's %in_1).
    %cst = arith.constant 1.250000e-01 : f16 loc(#loc19)
    %cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc20)
    // Import both buffer views as statically shaped f16 tensors.
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc(#loc1)
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc(#loc1)
    // %2 is the 10x4096x4096 tensor.empty result; %3 is %2 filled with 0.0.
    %2 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc21)
    %3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc22)
    // Batched matmul of %0 (10x4096x64) by %1 (10x64x4096); outs is the zero-filled %3.
    %4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc23)
    // Element-wise over %4: yields %in * %cst (0.125). %out is not read.
    %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) {
    ^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)):
      %8 = arith.mulf %in, %cst : f16 loc(#loc25)
      linalg.yield %8 : f16 loc(#loc26)
    } -> tensor<10x4096x4096xf16> loc(#loc24)
    // Element-wise over (%5, %2): yields %in + %in_1 * %cst_0 (0.0).
    // NOTE(review): %in_1 is read from %2, the tensor.empty result, not from the
    // zero-filled %3. If those elements are not finite, 0.0 * x is NaN - confirm
    // this is intended.
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) {
    ^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)):
      %8 = arith.mulf %in_1, %cst_0 : f16 loc(#loc28)
      %9 = arith.addf %in, %8 : f16 loc(#loc29)
      linalg.yield %9 : f16 loc(#loc30)
    } -> tensor<10x4096x4096xf16> loc(#loc27)
    // Export the result tensor as a !hal.buffer_view and return it.
    %7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc(#loc1)
    return %7 : !hal.buffer_view loc(#loc1)
  } loc(#loc1)
} loc(#loc)
// Remaining location aliases. #loc19-#loc30 each pair one source position with
// #loc1 in the form callsite(<position> at #loc1).
#loc = loc("/home/prashant/test.mlir":2:1)
#loc2 = loc("/home/prashant/test.mlir":5:14)
#loc3 = loc("/home/prashant/test.mlir":4:12)
#loc4 = loc("/home/prashant/test.mlir":6:10)
#loc5 = loc("/home/prashant/test.mlir":7:10)
#loc6 = loc("/home/prashant/test.mlir":8:10)
#loc7 = loc("/home/prashant/test.mlir":9:10)
#loc10 = loc("/home/prashant/test.mlir":11:12)
#loc11 = loc("/home/prashant/test.mlir":12:7)
#loc12 = loc("/home/prashant/test.mlir":14:10)
#loc16 = loc("/home/prashant/test.mlir":16:12)
#loc17 = loc("/home/prashant/test.mlir":17:12)
#loc18 = loc("/home/prashant/test.mlir":18:7)
#loc19 = loc(callsite(#loc2 at #loc1))
#loc20 = loc(callsite(#loc3 at #loc1))
#loc21 = loc(callsite(#loc4 at #loc1))
#loc22 = loc(callsite(#loc5 at #loc1))
#loc23 = loc(callsite(#loc6 at #loc1))
#loc24 = loc(callsite(#loc7 at #loc1))
#loc25 = loc(callsite(#loc10 at #loc1))
#loc26 = loc(callsite(#loc11 at #loc1))
#loc27 = loc(callsite(#loc12 at #loc1))
#loc28 = loc(callsite(#loc16 at #loc1))
#loc29 = loc(callsite(#loc17 at #loc1))
#loc30 = loc(callsite(#loc18 at #loc1))
// -----// IR Dump Before FixedPointIterator (iree-util-fixed-point-iterator) //----- // | |
// Location aliases: each #locN names a line:column position in
// /home/prashant/test.mlir so operations below can refer to it compactly.
#loc1 = loc("/home/prashant/test.mlir":3:3)
#loc8 = loc("/home/prashant/test.mlir":10:10)
#loc9 = loc("/home/prashant/test.mlir":10:20)
#loc13 = loc("/home/prashant/test.mlir":15:10)
#loc14 = loc("/home/prashant/test.mlir":15:20)
#loc15 = loc("/home/prashant/test.mlir":15:32)
// Identity indexing map over three dimensions; used by both linalg.generic ops.
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
module attributes {torch.debug_module_name = "_lambda"} {
  // @forward takes two !hal.buffer_view arguments and returns one.
  // Per element it computes batch_matmul(%0, %1) * 0.125 + %2 * 0.0.
  func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} {
    // f16 scalars: %cst = 0.125 (scale factor), %cst_0 = 0.0 (fill value and
    // multiplier applied to the second generic's %in_1).
    %cst = arith.constant 1.250000e-01 : f16 loc(#loc19)
    %cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc20)
    // Import both buffer views as statically shaped f16 tensors.
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc(#loc1)
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc(#loc1)
    // %2 is the 10x4096x4096 tensor.empty result; %3 is %2 filled with 0.0.
    %2 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc21)
    %3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc22)
    // Batched matmul of %0 (10x4096x64) by %1 (10x64x4096); outs is the zero-filled %3.
    %4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc23)
    // Element-wise over %4: yields %in * %cst (0.125). %out is not read.
    %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) {
    ^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)):
      %8 = arith.mulf %in, %cst : f16 loc(#loc25)
      linalg.yield %8 : f16 loc(#loc26)
    } -> tensor<10x4096x4096xf16> loc(#loc24)
    // Element-wise over (%5, %2): yields %in + %in_1 * %cst_0 (0.0).
    // NOTE(review): %in_1 is read from %2, the tensor.empty result, not from the
    // zero-filled %3. If those elements are not finite, 0.0 * x is NaN - confirm
    // this is intended.
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) {
    ^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)):
      %8 = arith.mulf %in_1, %cst_0 : f16 loc(#loc28)
      %9 = arith.addf %in, %8 : f16 loc(#loc29)
      linalg.yield %9 : f16 loc(#loc30)
    } -> tensor<10x4096x4096xf16> loc(#loc27)
    // Export the result tensor as a !hal.buffer_view and return it.
    %7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc(#loc1)
    return %7 : !hal.buffer_view loc(#loc1)
  } loc(#loc1)
} loc(#loc)
// Remaining location aliases. #loc19-#loc30 each pair one source position with
// #loc1 in the form callsite(<position> at #loc1).
#loc = loc("/home/prashant/test.mlir":2:1)
#loc2 = loc("/home/prashant/test.mlir":5:14)
#loc3 = loc("/home/prashant/test.mlir":4:12)
#loc4 = loc("/home/prashant/test.mlir":6:10)
#loc5 = loc("/home/prashant/test.mlir":7:10)
#loc6 = loc("/home/prashant/test.mlir":8:10)
#loc7 = loc("/home/prashant/test.mlir":9:10)
#loc10 = loc("/home/prashant/test.mlir":11:12)
#loc11 = loc("/home/prashant/test.mlir":12:7)
#loc12 = loc("/home/prashant/test.mlir":14:10)
#loc16 = loc("/home/prashant/test.mlir":16:12)
#loc17 = loc("/home/prashant/test.mlir":17:12)
#loc18 = loc("/home/prashant/test.mlir":18:7)
#loc19 = loc(callsite(#loc2 at #loc1))
#loc20 = loc(callsite(#loc3 at #loc1))
#loc21 = loc(callsite(#loc4 at #loc1))
#loc22 = loc(callsite(#loc5 at #loc1))
#loc23 = loc(callsite(#loc6 at #loc1))
#loc24 = loc(callsite(#loc7 at #loc1))
#loc25 = loc(callsite(#loc10 at #loc1))
#loc26 = loc(callsite(#loc11 at #loc1))
#loc27 = loc(callsite(#loc12 at #loc1))
#loc28 = loc(callsite(#loc16 at #loc1))
#loc29 = loc(callsite(#loc17 at #loc1))
#loc30 = loc(callsite(#loc18 at #loc1))
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%8 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %8 : f16 loc(callsite("/home/prashant/test.mlir":12:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":9:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%9 = arith.addf %in, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":18:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %7 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":10:20) | |
#loc13 = loc("/home/prashant/test.mlir":15:10) | |
#loc14 = loc("/home/prashant/test.mlir":15:20) | |
#loc15 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
module attributes {iree.fixedpoint.iteration = 0 : index, torch.debug_module_name = "_lambda"} { | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc19) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc20) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc(#loc1) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc(#loc1) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc21) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc22) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc23) | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%8 = arith.mulf %in, %cst : f16 loc(#loc25) | |
linalg.yield %8 : f16 loc(#loc26) | |
} -> tensor<10x4096x4096xf16> loc(#loc24) | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(#loc28) | |
%9 = arith.addf %in, %8 : f16 loc(#loc29) | |
linalg.yield %9 : f16 loc(#loc30) | |
} -> tensor<10x4096x4096xf16> loc(#loc27) | |
%7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc(#loc1) | |
return %7 : !hal.buffer_view loc(#loc1) | |
} loc(#loc1) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc2 = loc("/home/prashant/test.mlir":5:14) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":6:10) | |
#loc5 = loc("/home/prashant/test.mlir":7:10) | |
#loc6 = loc("/home/prashant/test.mlir":8:10) | |
#loc7 = loc("/home/prashant/test.mlir":9:10) | |
#loc10 = loc("/home/prashant/test.mlir":11:12) | |
#loc11 = loc("/home/prashant/test.mlir":12:7) | |
#loc12 = loc("/home/prashant/test.mlir":14:10) | |
#loc16 = loc("/home/prashant/test.mlir":16:12) | |
#loc17 = loc("/home/prashant/test.mlir":17:12) | |
#loc18 = loc("/home/prashant/test.mlir":18:7) | |
#loc19 = loc(callsite(#loc2 at #loc1)) | |
#loc20 = loc(callsite(#loc3 at #loc1)) | |
#loc21 = loc(callsite(#loc4 at #loc1)) | |
#loc22 = loc(callsite(#loc5 at #loc1)) | |
#loc23 = loc(callsite(#loc6 at #loc1)) | |
#loc24 = loc(callsite(#loc7 at #loc1)) | |
#loc25 = loc(callsite(#loc10 at #loc1)) | |
#loc26 = loc(callsite(#loc11 at #loc1)) | |
#loc27 = loc(callsite(#loc12 at #loc1)) | |
#loc28 = loc(callsite(#loc16 at #loc1)) | |
#loc29 = loc(callsite(#loc17 at #loc1)) | |
#loc30 = loc(callsite(#loc18 at #loc1)) | |
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":10:20) | |
#loc13 = loc("/home/prashant/test.mlir":15:10) | |
#loc14 = loc("/home/prashant/test.mlir":15:20) | |
#loc15 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
module attributes {iree.fixedpoint.iteration = 0 : index, torch.debug_module_name = "_lambda"} { | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc19) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc20) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc(#loc1) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc(#loc1) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc21) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc22) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc23) | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%8 = arith.mulf %in, %cst : f16 loc(#loc25) | |
linalg.yield %8 : f16 loc(#loc26) | |
} -> tensor<10x4096x4096xf16> loc(#loc24) | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(#loc28) | |
%9 = arith.addf %in, %8 : f16 loc(#loc29) | |
linalg.yield %9 : f16 loc(#loc30) | |
} -> tensor<10x4096x4096xf16> loc(#loc27) | |
%7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc(#loc1) | |
return %7 : !hal.buffer_view loc(#loc1) | |
} loc(#loc1) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc2 = loc("/home/prashant/test.mlir":5:14) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":6:10) | |
#loc5 = loc("/home/prashant/test.mlir":7:10) | |
#loc6 = loc("/home/prashant/test.mlir":8:10) | |
#loc7 = loc("/home/prashant/test.mlir":9:10) | |
#loc10 = loc("/home/prashant/test.mlir":11:12) | |
#loc11 = loc("/home/prashant/test.mlir":12:7) | |
#loc12 = loc("/home/prashant/test.mlir":14:10) | |
#loc16 = loc("/home/prashant/test.mlir":16:12) | |
#loc17 = loc("/home/prashant/test.mlir":17:12) | |
#loc18 = loc("/home/prashant/test.mlir":18:7) | |
#loc19 = loc(callsite(#loc2 at #loc1)) | |
#loc20 = loc(callsite(#loc3 at #loc1)) | |
#loc21 = loc(callsite(#loc4 at #loc1)) | |
#loc22 = loc(callsite(#loc5 at #loc1)) | |
#loc23 = loc(callsite(#loc6 at #loc1)) | |
#loc24 = loc(callsite(#loc7 at #loc1)) | |
#loc25 = loc(callsite(#loc10 at #loc1)) | |
#loc26 = loc(callsite(#loc11 at #loc1)) | |
#loc27 = loc(callsite(#loc12 at #loc1)) | |
#loc28 = loc(callsite(#loc16 at #loc1)) | |
#loc29 = loc(callsite(#loc17 at #loc1)) | |
#loc30 = loc(callsite(#loc18 at #loc1)) | |
// -----// IR Dump Before IPO (iree-util-ipo) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":10:20) | |
#loc13 = loc("/home/prashant/test.mlir":15:10) | |
#loc14 = loc("/home/prashant/test.mlir":15:20) | |
#loc15 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
module attributes {iree.fixedpoint.iteration = 0 : index, torch.debug_module_name = "_lambda"} { | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc19) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc20) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc(#loc1) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc(#loc1) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc21) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc22) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc23) | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%8 = arith.mulf %in, %cst : f16 loc(#loc25) | |
linalg.yield %8 : f16 loc(#loc26) | |
} -> tensor<10x4096x4096xf16> loc(#loc24) | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(#loc28) | |
%9 = arith.addf %in, %8 : f16 loc(#loc29) | |
linalg.yield %9 : f16 loc(#loc30) | |
} -> tensor<10x4096x4096xf16> loc(#loc27) | |
%7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc(#loc1) | |
return %7 : !hal.buffer_view loc(#loc1) | |
} loc(#loc1) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc2 = loc("/home/prashant/test.mlir":5:14) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":6:10) | |
#loc5 = loc("/home/prashant/test.mlir":7:10) | |
#loc6 = loc("/home/prashant/test.mlir":8:10) | |
#loc7 = loc("/home/prashant/test.mlir":9:10) | |
#loc10 = loc("/home/prashant/test.mlir":11:12) | |
#loc11 = loc("/home/prashant/test.mlir":12:7) | |
#loc12 = loc("/home/prashant/test.mlir":14:10) | |
#loc16 = loc("/home/prashant/test.mlir":16:12) | |
#loc17 = loc("/home/prashant/test.mlir":17:12) | |
#loc18 = loc("/home/prashant/test.mlir":18:7) | |
#loc19 = loc(callsite(#loc2 at #loc1)) | |
#loc20 = loc(callsite(#loc3 at #loc1)) | |
#loc21 = loc(callsite(#loc4 at #loc1)) | |
#loc22 = loc(callsite(#loc5 at #loc1)) | |
#loc23 = loc(callsite(#loc6 at #loc1)) | |
#loc24 = loc(callsite(#loc7 at #loc1)) | |
#loc25 = loc(callsite(#loc10 at #loc1)) | |
#loc26 = loc(callsite(#loc11 at #loc1)) | |
#loc27 = loc(callsite(#loc12 at #loc1)) | |
#loc28 = loc(callsite(#loc16 at #loc1)) | |
#loc29 = loc(callsite(#loc17 at #loc1)) | |
#loc30 = loc(callsite(#loc18 at #loc1)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%8 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %8 : f16 loc(callsite("/home/prashant/test.mlir":12:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":9:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%9 = arith.addf %in, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":18:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %7 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before CSE (cse) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%8 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %8 : f16 loc(callsite("/home/prashant/test.mlir":12:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":9:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%9 = arith.addf %in, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":18:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %7 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before TensorPadToTensorInsertSlice (iree-flow-tensor-pad-to-tensor-insert-slice) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":10:20) | |
#loc13 = loc("/home/prashant/test.mlir":15:10) | |
#loc14 = loc("/home/prashant/test.mlir":15:20) | |
#loc15 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
module attributes {torch.debug_module_name = "_lambda"} { | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc19) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc20) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc(#loc1) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc(#loc1) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc21) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc22) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc23) | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%8 = arith.mulf %in, %cst : f16 loc(#loc25) | |
linalg.yield %8 : f16 loc(#loc26) | |
} -> tensor<10x4096x4096xf16> loc(#loc24) | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(#loc28) | |
%9 = arith.addf %in, %8 : f16 loc(#loc29) | |
linalg.yield %9 : f16 loc(#loc30) | |
} -> tensor<10x4096x4096xf16> loc(#loc27) | |
%7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc(#loc1) | |
return %7 : !hal.buffer_view loc(#loc1) | |
} loc(#loc1) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc2 = loc("/home/prashant/test.mlir":5:14) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":6:10) | |
#loc5 = loc("/home/prashant/test.mlir":7:10) | |
#loc6 = loc("/home/prashant/test.mlir":8:10) | |
#loc7 = loc("/home/prashant/test.mlir":9:10) | |
#loc10 = loc("/home/prashant/test.mlir":11:12) | |
#loc11 = loc("/home/prashant/test.mlir":12:7) | |
#loc12 = loc("/home/prashant/test.mlir":14:10) | |
#loc16 = loc("/home/prashant/test.mlir":16:12) | |
#loc17 = loc("/home/prashant/test.mlir":17:12) | |
#loc18 = loc("/home/prashant/test.mlir":18:7) | |
#loc19 = loc(callsite(#loc2 at #loc1)) | |
#loc20 = loc(callsite(#loc3 at #loc1)) | |
#loc21 = loc(callsite(#loc4 at #loc1)) | |
#loc22 = loc(callsite(#loc5 at #loc1)) | |
#loc23 = loc(callsite(#loc6 at #loc1)) | |
#loc24 = loc(callsite(#loc7 at #loc1)) | |
#loc25 = loc(callsite(#loc10 at #loc1)) | |
#loc26 = loc(callsite(#loc11 at #loc1)) | |
#loc27 = loc(callsite(#loc12 at #loc1)) | |
#loc28 = loc(callsite(#loc16 at #loc1)) | |
#loc29 = loc(callsite(#loc17 at #loc1)) | |
#loc30 = loc(callsite(#loc18 at #loc1)) | |
// -----// IR Dump Before ConvertElementwiseToLinalg (convert-elementwise-to-linalg) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%8 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %8 : f16 loc(callsite("/home/prashant/test.mlir":12:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":9:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%9 = arith.addf %in, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":18:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %7 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before LinalgFoldUnitExtentDims (linalg-fold-unit-extent-dims) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%8 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %8 : f16 loc(callsite("/home/prashant/test.mlir":12:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":9:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%9 = arith.addf %in, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":18:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %7 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%8 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %8 : f16 loc(callsite("/home/prashant/test.mlir":12:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":9:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%9 = arith.addf %in, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":18:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %7 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%8 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %8 : f16 loc(callsite("/home/prashant/test.mlir":12:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":9:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%9 = arith.addf %in, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":18:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %7 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%8 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %8 : f16 loc(callsite("/home/prashant/test.mlir":12:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":9:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%9 = arith.addf %in, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":18:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %7 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before CSE (cse) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%8 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %8 : f16 loc(callsite("/home/prashant/test.mlir":12:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":9:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%9 = arith.addf %in, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":18:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %7 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before FusionOfTensorOps (iree-flow-fusion-of-tensor-ops) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %out: f16 loc("/home/prashant/test.mlir":10:20)): | |
%8 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %8 : f16 loc(callsite("/home/prashant/test.mlir":12:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":9:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%9 = arith.addf %in, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":18:7 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = hal.tensor.export %6 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %7 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before LinalgDetensorize (linalg-detensorize) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%7 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%9 = arith.addf %7, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = hal.tensor.export %5 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %6 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%7 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%9 = arith.addf %7, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = hal.tensor.export %5 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %6 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before CSE (cse) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%7 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%9 = arith.addf %7, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = hal.tensor.export %5 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %6 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before CollapseDims (iree-flow-collapse-dims) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%7 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%9 = arith.addf %7, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = hal.tensor.export %5 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %6 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before SplitReduction (iree-flow-split-reduction-ops) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%7 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%9 = arith.addf %7, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = hal.tensor.export %5 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %6 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%7 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%9 = arith.addf %7, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = hal.tensor.export %5 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %6 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before FormDispatchRegions (iree-flow-form-dispatch-regions) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%7 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%8 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%9 = arith.addf %7, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = hal.tensor.export %5 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %6 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before FormDispatchWorkgroups (iree-flow-form-dispatch-workgroups) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%c1 = arith.constant 1 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c1_1 = arith.constant 1 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 - s0) ceildiv s2)>()[%c0, %c10, %c1_1] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0_2 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c1_3 = arith.constant 1 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 - s0) ceildiv s2)>()[%c0_2, %c4096, %c1_3] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0_4 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096_5 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c1_6 = arith.constant 1 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 - s0) ceildiv s2)>()[%c0_4, %c4096_5, %c1_6] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = flow.dispatch.region[%4, %5, %6] -> (tensor<10x4096x4096xf16>) { | |
%9 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%9, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_7: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%11 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%12 = arith.mulf %in_7, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%13 = arith.addf %11, %12 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %13 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.return %10 : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} count(%arg2: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg3: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg4: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg2, %arg3, %arg4 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.return %x, %y, %z : index, index, index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%8 = hal.tensor.export %7 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %8 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before CaptureDispatchDynamicDims (iree-flow-capture-dispatch-dynamic-dims) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = flow.dispatch.workgroups[%c10, %c4096, %c4096](%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> = | |
(%arg2: !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>>, %arg3: !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>>, %arg4: !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%4 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = linalg.fill ins(%cst_0 : f16) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%8 = linalg.batch_matmul ins(%4, %5 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%7 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%8, %6 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%10 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%11 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%12 = arith.addf %10, %11 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %12 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.dispatch.tensor.store %9, %arg4, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} count(%arg2: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg3: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg4: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg2, %arg3, %arg4 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.return %x, %y, %z : index, index, index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %3 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = flow.dispatch.workgroups[%c10, %c4096, %c4096](%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> = | |
(%arg2: !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>>, %arg3: !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>>, %arg4: !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%4 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = linalg.fill ins(%cst_0 : f16) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%8 = linalg.batch_matmul ins(%4, %5 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%7 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%8, %6 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%10 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%11 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%12 = arith.addf %10, %11 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %12 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.dispatch.tensor.store %9, %arg4, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} count(%arg2: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg3: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg4: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg2, %arg3, %arg4 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.return %x, %y, %z : index, index, index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %3 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before CSE (cse) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = flow.dispatch.workgroups[%c10, %c4096, %c4096](%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> = | |
(%arg2: !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>>, %arg3: !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>>, %arg4: !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%4 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = linalg.fill ins(%cst_0 : f16) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%8 = linalg.batch_matmul ins(%4, %5 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%7 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%8, %6 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%10 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%11 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%12 = arith.addf %10, %11 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %12 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.dispatch.tensor.store %9, %arg4, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} count(%arg2: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg3: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg4: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg2, %arg3, %arg4 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.return %x, %y, %z : index, index, index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %3 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before InitializeEmptyTensors (iree-flow-initialize-empty-tensors) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":3:3) | |
#loc2 = loc("/home/prashant/test.mlir":14:10) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc2 at #loc1)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc(#loc1) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc(#loc1) | |
%2 = flow.dispatch.workgroups[%c10, %c4096, %c4096](%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> = | |
(%arg2: !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>>, %arg3: !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>>, %arg4: !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%4 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%6 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%7 = linalg.fill ins(%cst_0 : f16) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%8 = linalg.batch_matmul ins(%4, %5 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%7 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%8, %6 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%10 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%11 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%12 = arith.addf %10, %11 : f16 loc(#loc22) | |
linalg.yield %12 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %9, %arg4, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
flow.return loc(#loc14) | |
} count(%arg2: index loc(callsite(#loc2 at #loc1)), %arg3: index loc(callsite(#loc2 at #loc1)), %arg4: index loc(callsite(#loc2 at #loc1))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg2, %arg3, %arg4 loc(#loc14) | |
flow.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc(#loc1) | |
return %3 : !hal.buffer_view loc(#loc1) | |
} loc(#loc1) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc1)) | |
#loc16 = loc(callsite(#loc4 at #loc1)) | |
#loc17 = loc(callsite(#loc5 at #loc1)) | |
#loc18 = loc(callsite(#loc6 at #loc1)) | |
#loc19 = loc(callsite(#loc7 at #loc1)) | |
#loc20 = loc(callsite(#loc11 at #loc1)) | |
#loc21 = loc(callsite(#loc12 at #loc1)) | |
#loc22 = loc(callsite(#loc13 at #loc1)) | |
// -----// IR Dump Before OutlineDispatchRegions (iree-flow-outline-dispatch-regions) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":3:3) | |
#loc2 = loc("/home/prashant/test.mlir":14:10) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc2 at #loc1)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc(#loc1) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc(#loc1) | |
%2 = flow.dispatch.workgroups[%c10, %c4096, %c4096](%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> = | |
(%arg2: !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>>, %arg3: !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>>, %arg4: !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%4 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%6 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%7 = linalg.fill ins(%cst_0 : f16) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%8 = linalg.batch_matmul ins(%4, %5 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%7 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%8, %6 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%10 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%11 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%12 = arith.addf %10, %11 : f16 loc(#loc22) | |
linalg.yield %12 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %9, %arg4, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
flow.return loc(#loc14) | |
} count(%arg2: index loc(callsite(#loc2 at #loc1)), %arg3: index loc(callsite(#loc2 at #loc1)), %arg4: index loc(callsite(#loc2 at #loc1))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg2, %arg3, %arg4 loc(#loc14) | |
flow.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc(#loc1) | |
return %3 : !hal.buffer_view loc(#loc1) | |
} loc(#loc1) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc1)) | |
#loc16 = loc(callsite(#loc4 at #loc1)) | |
#loc17 = loc(callsite(#loc5 at #loc1)) | |
#loc18 = loc(callsite(#loc6 at #loc1)) | |
#loc19 = loc(callsite(#loc7 at #loc1)) | |
#loc20 = loc(callsite(#loc11 at #loc1)) | |
#loc21 = loc(callsite(#loc12 at #loc1)) | |
#loc22 = loc(callsite(#loc13 at #loc1)) | |
// -----// IR Dump Before StripDebugOps (iree-util-strip-debug-ops) //----- // | |
flow.executable private @forward_dispatch_0 { | |
flow.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg1: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg2: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.return %x, %y, %z : index, index, index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3), %arg1: !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3), %arg2: !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3))) { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%6 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%7 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%8 = arith.addf %6, %7 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %8 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.dispatch.tensor.store %5, %arg2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = flow.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %3 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before DeduplicateExecutables (iree-flow-deduplicate-executables) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
flow.executable private @forward_dispatch_0 { | |
flow.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
flow.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3), %arg1: !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3), %arg2: !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%6 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%7 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%8 = arith.addf %6, %7 : f16 loc(#loc22) | |
linalg.yield %8 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %5, %arg2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc(#loc2) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc(#loc2) | |
%2 = flow.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc14) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc(#loc2) | |
return %3 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
flow.executable private @forward_dispatch_0 { | |
flow.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg1: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg2: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.return %x, %y, %z : index, index, index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3), %arg1: !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3), %arg2: !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3))) { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%6 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%7 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%8 = arith.addf %6, %7 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %8 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.dispatch.tensor.store %5, %arg2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before CleanupTensorShapes (iree-flow-cleanup-tensor-shapes) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = flow.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %3 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before CSE (cse) //----- // | |
flow.executable private @forward_dispatch_0 { | |
flow.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg1: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg2: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.return %x, %y, %z : index, index, index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3), %arg1: !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3), %arg2: !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3))) { | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%6 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%7 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%8 = arith.addf %6, %7 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %8 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.dispatch.tensor.store %5, %arg2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = flow.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %3 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before CSE (cse) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = flow.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %3 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before SymbolDCE (symbol-dce) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
flow.executable private @forward_dispatch_0 { | |
flow.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
flow.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3), %arg1: !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3), %arg2: !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%6 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%7 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%8 = arith.addf %6, %7 : f16 loc(#loc22) | |
linalg.yield %8 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %5, %arg2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc(#loc2) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc(#loc2) | |
%2 = flow.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc14) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc(#loc2) | |
return %3 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before VerifyInput (iree-stream-verify-input) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
flow.executable private @forward_dispatch_0 { | |
flow.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
flow.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3), %arg1: !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3), %arg2: !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%6 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%7 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%8 = arith.addf %6, %7 : f16 loc(#loc22) | |
linalg.yield %8 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %5, %arg2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc(#loc2) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc(#loc2) | |
%2 = flow.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc14) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc(#loc2) | |
return %3 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before OutlineConstants (iree-stream-outline-constants) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
flow.executable private @forward_dispatch_0 { | |
flow.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
flow.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3), %arg1: !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3), %arg2: !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%6 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%7 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%8 = arith.addf %6, %7 : f16 loc(#loc22) | |
linalg.yield %8 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %5, %arg2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc(#loc2) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc(#loc2) | |
%2 = flow.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc14) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc(#loc2) | |
return %3 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = flow.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %3 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before CSE (cse) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = flow.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %3 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc("/home/prashant/test.mlir":3:3) | |
%2 = flow.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %3 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
flow.executable private @forward_dispatch_0 { | |
flow.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
flow.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3), %arg1: !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3), %arg2: !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%6 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%7 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%8 = arith.addf %6, %7 : f16 loc(#loc22) | |
linalg.yield %8 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %5, %arg2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc(#loc2) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc(#loc2) | |
%2 = flow.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc14) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc(#loc2) | |
return %3 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
flow.executable private @forward_dispatch_0 { | |
flow.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
flow.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3), %arg1: !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3), %arg2: !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%6 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%7 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%8 = arith.addf %6, %7 : f16 loc(#loc22) | |
linalg.yield %8 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %5, %arg2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc(#loc2) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc(#loc2) | |
%2 = flow.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc14) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc(#loc2) | |
return %3 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
flow.executable private @forward_dispatch_0 { | |
flow.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
flow.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3), %arg1: !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3), %arg2: !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%6 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%7 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%8 = arith.addf %6, %7 : f16 loc(#loc22) | |
linalg.yield %8 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %5, %arg2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc(#loc2) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc(#loc2) | |
%2 = flow.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc14) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc(#loc2) | |
return %3 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before IPO (iree-util-ipo) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
flow.executable private @forward_dispatch_0 { | |
flow.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
flow.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3), %arg1: !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3), %arg2: !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%6 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%7 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%8 = arith.addf %6, %7 : f16 loc(#loc22) | |
linalg.yield %8 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %5, %arg2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc(#loc2) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc(#loc2) | |
%2 = flow.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc14) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc(#loc2) | |
return %3 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before ConvertToStream (iree-stream-conversion) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
flow.executable private @forward_dispatch_0 { | |
flow.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
flow.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3), %arg1: !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3), %arg2: !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%2 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%3 = linalg.fill ins(%cst_0 : f16) outs(%2 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%4 = linalg.batch_matmul ins(%0, %1 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%3 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %2 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%2 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%6 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%7 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%8 = arith.addf %6, %7 : f16 loc(#loc22) | |
linalg.yield %8 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %5, %arg2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> loc(#loc2) | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> loc(#loc2) | |
%2 = flow.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0, %1) : (tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc14) | |
%3 = hal.tensor.export %2 : tensor<10x4096x4096xf16> -> !hal.buffer_view loc(#loc2) | |
return %3 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before VerifyLoweringToTensors (iree-stream-verify-lowering-to-tensors) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst_0 : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c10_0 = arith.constant 10 : index loc(#loc2) | |
%c4096_1 = arith.constant 4096 : index loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10_0, %c4096_1, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.sizeof tensor<10x4096x64xf16> : index loc(#loc2) | |
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%0} loc(#loc2) | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} loc(#loc2) | |
%c553648144_i32_2 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32_3 = arith.constant 1 : i32 loc(#loc2) | |
%c10_4 = arith.constant 10 : index loc(#loc2) | |
%c64_5 = arith.constant 64 : index loc(#loc2) | |
%c4096_6 = arith.constant 4096 : index loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10_4, %c64_5, %c4096_6]) type(%c553648144_i32_2) encoding(%c1_i32_3) loc(#loc2) | |
%3 = stream.tensor.sizeof tensor<10x64x4096xf16> : index loc(#loc2) | |
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%3} loc(#loc2) | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} loc(#loc2) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%6 = stream.tensor.sizeof tensor<10x4096x4096xf16> : index loc(#loc14) | |
%7 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%6} loc(#loc14) | |
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6} loc(#loc2) | |
%9 = stream.tensor.export %8 : tensor<10x4096x4096xf16> in !stream.resource<external>{%6} -> !hal.buffer_view loc(#loc2) | |
return %9 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c10_0 = arith.constant 10 : index loc("/home/prashant/test.mlir":3:3) | |
%c4096_1 = arith.constant 4096 : index loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10_0, %c4096_1, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.sizeof tensor<10x4096x64xf16> : index loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%0} loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} loc("/home/prashant/test.mlir":3:3) | |
%c553648144_i32_2 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32_3 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c10_4 = arith.constant 10 : index loc("/home/prashant/test.mlir":3:3) | |
%c64_5 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
%c4096_6 = arith.constant 4096 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10_4, %c64_5, %c4096_6]) type(%c553648144_i32_2) encoding(%c1_i32_3) loc("/home/prashant/test.mlir":3:3) | |
%3 = stream.tensor.sizeof tensor<10x64x4096xf16> : index loc("/home/prashant/test.mlir":3:3) | |
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%3} loc("/home/prashant/test.mlir":3:3) | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} loc("/home/prashant/test.mlir":3:3) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = stream.tensor.sizeof tensor<10x4096x4096xf16> : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%6} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6} loc("/home/prashant/test.mlir":3:3) | |
%9 = stream.tensor.export %8 : tensor<10x4096x4096xf16> in !stream.resource<external>{%6} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %9 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before CSE (cse) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.sizeof tensor<10x4096x64xf16> : index loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%0} loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%3 = stream.tensor.sizeof tensor<10x64x4096xf16> : index loc("/home/prashant/test.mlir":3:3) | |
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%3} loc("/home/prashant/test.mlir":3:3) | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} loc("/home/prashant/test.mlir":3:3) | |
%6 = stream.tensor.sizeof tensor<10x4096x4096xf16> : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%6} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6} loc("/home/prashant/test.mlir":3:3) | |
%9 = stream.tensor.export %8 : tensor<10x4096x4096xf16> in !stream.resource<external>{%6} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %9 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.sizeof tensor<10x4096x64xf16> : index loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%0} loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%3 = stream.tensor.sizeof tensor<10x64x4096xf16> : index loc("/home/prashant/test.mlir":3:3) | |
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%3} loc("/home/prashant/test.mlir":3:3) | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} loc("/home/prashant/test.mlir":3:3) | |
%6 = stream.tensor.sizeof tensor<10x4096x4096xf16> : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%6} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6} loc("/home/prashant/test.mlir":3:3) | |
%9 = stream.tensor.export %8 : tensor<10x4096x4096xf16> in !stream.resource<external>{%6} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %9 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst_0 : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.sizeof tensor<10x4096x64xf16> : index loc(#loc2) | |
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%0} loc(#loc2) | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%3 = stream.tensor.sizeof tensor<10x64x4096xf16> : index loc(#loc2) | |
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%3} loc(#loc2) | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} loc(#loc2) | |
%6 = stream.tensor.sizeof tensor<10x4096x4096xf16> : index loc(#loc14) | |
%7 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%6} loc(#loc14) | |
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6} loc(#loc2) | |
%9 = stream.tensor.export %8 : tensor<10x4096x4096xf16> in !stream.resource<external>{%6} -> !hal.buffer_view loc(#loc2) | |
return %9 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.sizeof tensor<10x4096x64xf16> : index loc(#loc2) | |
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%0} loc(#loc2) | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%3 = stream.tensor.sizeof tensor<10x64x4096xf16> : index loc(#loc2) | |
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%3} loc(#loc2) | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} loc(#loc2) | |
%6 = stream.tensor.sizeof tensor<10x4096x4096xf16> : index loc(#loc14) | |
%7 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%6} loc(#loc14) | |
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6} loc(#loc2) | |
%9 = stream.tensor.export %8 : tensor<10x4096x4096xf16> in !stream.resource<external>{%6} -> !hal.buffer_view loc(#loc2) | |
return %9 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.sizeof tensor<10x4096x64xf16> : index loc(#loc2) | |
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%0} loc(#loc2) | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%3 = stream.tensor.sizeof tensor<10x64x4096xf16> : index loc(#loc2) | |
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%3} loc(#loc2) | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} loc(#loc2) | |
%6 = stream.tensor.sizeof tensor<10x4096x4096xf16> : index loc(#loc14) | |
%7 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%6} loc(#loc14) | |
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6} loc(#loc2) | |
%9 = stream.tensor.export %8 : tensor<10x4096x4096xf16> in !stream.resource<external>{%6} -> !hal.buffer_view loc(#loc2) | |
return %9 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before IPO (iree-util-ipo) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.sizeof tensor<10x4096x64xf16> : index loc(#loc2) | |
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%0} loc(#loc2) | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%3 = stream.tensor.sizeof tensor<10x64x4096xf16> : index loc(#loc2) | |
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%3} loc(#loc2) | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} loc(#loc2) | |
%6 = stream.tensor.sizeof tensor<10x4096x4096xf16> : index loc(#loc14) | |
%7 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%6} loc(#loc14) | |
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6} loc(#loc2) | |
%9 = stream.tensor.export %8 : tensor<10x4096x4096xf16> in !stream.resource<external>{%6} -> !hal.buffer_view loc(#loc2) | |
return %9 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before CombineInitializers (iree-util-combine-initializers) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.sizeof tensor<10x4096x64xf16> : index loc(#loc2) | |
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%0} loc(#loc2) | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%3 = stream.tensor.sizeof tensor<10x64x4096xf16> : index loc(#loc2) | |
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%3} loc(#loc2) | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} loc(#loc2) | |
%6 = stream.tensor.sizeof tensor<10x4096x4096xf16> : index loc(#loc14) | |
%7 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%6} loc(#loc14) | |
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6} loc(#loc2) | |
%9 = stream.tensor.export %8 : tensor<10x4096x4096xf16> in !stream.resource<external>{%6} -> !hal.buffer_view loc(#loc2) | |
return %9 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before EncodeDeviceTensors (iree-stream-encode-device-tensors) //----- // | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg1: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg2: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
stream.return %x, %y, %z : index, index, index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%10 = arith.mulf %in_1, %cst : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%11 = arith.addf %9, %10 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %11 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before EncodeHostTensors (iree-stream-encode-host-tensors) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.sizeof tensor<10x4096x64xf16> : index loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%0} loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%3 = stream.tensor.sizeof tensor<10x64x4096xf16> : index loc("/home/prashant/test.mlir":3:3) | |
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%3} loc("/home/prashant/test.mlir":3:3) | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} loc("/home/prashant/test.mlir":3:3) | |
%6 = stream.tensor.sizeof tensor<10x4096x4096xf16> : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%2[%c0 to %0 for %0], %5[%c0 to %3 for %3]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%6} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6} loc("/home/prashant/test.mlir":3:3) | |
%9 = stream.tensor.export %8 : tensor<10x4096x4096xf16> in !stream.resource<external>{%6} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %9 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before MaterializeBuiltins (iree-stream-materialize-builtins) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc(#loc2) | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%1[%c0 to %c5242880 for %c5242880], %3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<*>{%c5242880}, !stream.resource<*>{%c5242880}) -> !stream.resource<*>{%c335544320} loc(#loc14) | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c335544320} -> !stream.resource<external>{%c335544320} loc(#loc2) | |
%6 = stream.tensor.export %5 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %6 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%1[%c0 to %c5242880 for %c5242880], %3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<*>{%c5242880}, !stream.resource<*>{%c5242880}) -> !stream.resource<*>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c335544320} -> !stream.resource<external>{%c335544320} loc("/home/prashant/test.mlir":3:3) | |
%6 = stream.tensor.export %5 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %6 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before CSE (cse) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%1[%c0 to %c5242880 for %c5242880], %3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<*>{%c5242880}, !stream.resource<*>{%c5242880}) -> !stream.resource<*>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c335544320} -> !stream.resource<external>{%c335544320} loc("/home/prashant/test.mlir":3:3) | |
%6 = stream.tensor.export %5 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %6 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%1[%c0 to %c5242880 for %c5242880], %3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<*>{%c5242880}, !stream.resource<*>{%c5242880}) -> !stream.resource<*>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c335544320} -> !stream.resource<external>{%c335544320} loc("/home/prashant/test.mlir":3:3) | |
%6 = stream.tensor.export %5 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %6 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc(#loc2) | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%1[%c0 to %c5242880 for %c5242880], %3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<*>{%c5242880}, !stream.resource<*>{%c5242880}) -> !stream.resource<*>{%c335544320} loc(#loc14) | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c335544320} -> !stream.resource<external>{%c335544320} loc(#loc2) | |
%6 = stream.tensor.export %5 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %6 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc(#loc2) | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%1[%c0 to %c5242880 for %c5242880], %3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<*>{%c5242880}, !stream.resource<*>{%c5242880}) -> !stream.resource<*>{%c335544320} loc(#loc14) | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c335544320} -> !stream.resource<external>{%c335544320} loc(#loc2) | |
%6 = stream.tensor.export %5 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %6 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc(#loc2) | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%1[%c0 to %c5242880 for %c5242880], %3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<*>{%c5242880}, !stream.resource<*>{%c5242880}) -> !stream.resource<*>{%c335544320} loc(#loc14) | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c335544320} -> !stream.resource<external>{%c335544320} loc(#loc2) | |
%6 = stream.tensor.export %5 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %6 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before IPO (iree-util-ipo) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc(#loc2) | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%1[%c0 to %c5242880 for %c5242880], %3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<*>{%c5242880}, !stream.resource<*>{%c5242880}) -> !stream.resource<*>{%c335544320} loc(#loc14) | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c335544320} -> !stream.resource<external>{%c335544320} loc(#loc2) | |
%6 = stream.tensor.export %5 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %6 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before MaterializeCopyOnWrite (iree-stream-materialize-copy-on-write) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%1[%c0 to %c5242880 for %c5242880], %3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<*>{%c5242880}, !stream.resource<*>{%c5242880}) -> !stream.resource<*>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c335544320} -> !stream.resource<external>{%c335544320} loc("/home/prashant/test.mlir":3:3) | |
%6 = stream.tensor.export %5 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %6 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before ElideAsyncCopies (iree-stream-elide-async-copies) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc(#loc2) | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%1[%c0 to %c5242880 for %c5242880], %3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<*>{%c5242880}, !stream.resource<*>{%c5242880}) -> !stream.resource<*>{%c335544320} loc(#loc14) | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c335544320} -> !stream.resource<external>{%c335544320} loc(#loc2) | |
%6 = stream.tensor.export %5 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %6 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%1[%c0 to %c5242880 for %c5242880], %3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<*>{%c5242880}, !stream.resource<*>{%c5242880}) -> !stream.resource<*>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c335544320} -> !stream.resource<external>{%c335544320} loc("/home/prashant/test.mlir":3:3) | |
%6 = stream.tensor.export %5 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %6 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before EmplaceAllocations (iree-stream-emplace-allocations) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%1[%c0 to %c5242880 for %c5242880], %3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<*>{%c5242880}, !stream.resource<*>{%c5242880}) -> !stream.resource<*>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c335544320} -> !stream.resource<external>{%c335544320} loc("/home/prashant/test.mlir":3:3) | |
%6 = stream.tensor.export %5 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %6 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before RefineUsage (iree-stream-refine-usage) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c5242880} -> !stream.resource<*>{%c5242880} loc(#loc2) | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%1[%c0 to %c5242880 for %c5242880], %3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<*>{%c5242880}, !stream.resource<*>{%c5242880}) -> !stream.resource<*>{%c335544320} loc(#loc14) | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c335544320} -> !stream.resource<external>{%c335544320} loc(#loc2) | |
%6 = stream.tensor.export %5 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %6 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0[%c0 to %c5242880 for %c5242880], %1[%c0 to %c5242880 for %c5242880]) : (!stream.resource<external>{%c5242880}, !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.tensor.export %2 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %3 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before CSE (cse) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0[%c0 to %c5242880 for %c5242880], %1[%c0 to %c5242880 for %c5242880]) : (!stream.resource<external>{%c5242880}, !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.tensor.export %2 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %3 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0[%c0 to %c5242880 for %c5242880], %1[%c0 to %c5242880 for %c5242880]) : (!stream.resource<external>{%c5242880}, !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.tensor.export %2 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %3 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0[%c0 to %c5242880 for %c5242880], %1[%c0 to %c5242880 for %c5242880]) : (!stream.resource<external>{%c5242880}, !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.tensor.export %2 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %3 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0[%c0 to %c5242880 for %c5242880], %1[%c0 to %c5242880 for %c5242880]) : (!stream.resource<external>{%c5242880}, !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.tensor.export %2 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %3 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0[%c0 to %c5242880 for %c5242880], %1[%c0 to %c5242880 for %c5242880]) : (!stream.resource<external>{%c5242880}, !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.tensor.export %2 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %3 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before IPO (iree-util-ipo) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0[%c0 to %c5242880 for %c5242880], %1[%c0 to %c5242880 for %c5242880]) : (!stream.resource<external>{%c5242880}, !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.tensor.export %2 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %3 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before ScheduleExecution (iree-stream-schedule-execution) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%0[%c0 to %c5242880 for %c5242880], %1[%c0 to %c5242880 for %c5242880]) : (!stream.resource<external>{%c5242880}, !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.tensor.export %2 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %3 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before ScheduleConcurrency (iree-stream-schedule-concurrency) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} { | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%arg2[%c0 to %c5242880 for %c5242880], %arg3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<external>{%c5242880}, !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
stream.yield %4 : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.tensor.export %2 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %3 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before PropagateTimepoints (iree-stream-propagate-timepoints) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} { | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%arg2[%c0 to %c5242880 for %c5242880], %arg3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<external>{%c5242880}, !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} loc(#loc14) | |
stream.yield %4 : !stream.resource<external>{%c335544320} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.tensor.export %2 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %3 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.timepoint.immediate => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.timepoint.immediate => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = stream.timepoint.immediate => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%results, %result_timepoint = stream.async.execute await(%4) => with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} { | |
%7 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%arg2[%c0 to %c5242880 for %c5242880], %arg3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<external>{%c5242880}, !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
stream.yield %7 : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = stream.tensor.export %5 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %6 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before CSE (cse) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} { | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%arg2[%c0 to %c5242880 for %c5242880], %arg3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<external>{%c5242880}, !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
stream.yield %4 : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.tensor.export %2 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %3 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} { | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%arg2[%c0 to %c5242880 for %c5242880], %arg3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<external>{%c5242880}, !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
stream.yield %4 : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.tensor.export %2 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %3 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} { | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%arg2[%c0 to %c5242880 for %c5242880], %arg3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<external>{%c5242880}, !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} loc(#loc14) | |
stream.yield %4 : !stream.resource<external>{%c335544320} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.tensor.export %2 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %3 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} { | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%arg2[%c0 to %c5242880 for %c5242880], %arg3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<external>{%c5242880}, !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} loc(#loc14) | |
stream.yield %4 : !stream.resource<external>{%c335544320} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.tensor.export %2 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %3 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} { | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%arg2[%c0 to %c5242880 for %c5242880], %arg3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<external>{%c5242880}, !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} loc(#loc14) | |
stream.yield %4 : !stream.resource<external>{%c335544320} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.tensor.export %2 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %3 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before IPO (iree-util-ipo) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} { | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%arg2[%c0 to %c5242880 for %c5242880], %arg3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<external>{%c5242880}, !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} loc(#loc14) | |
stream.yield %4 : !stream.resource<external>{%c335544320} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.tensor.export %2 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %3 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before VerifyLoweringToAsync (iree-stream-verify-lowering-to-async) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} { | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%arg2[%c0 to %c5242880 for %c5242880], %arg3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<external>{%c5242880}, !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} loc(#loc14) | |
stream.yield %4 : !stream.resource<external>{%c335544320} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.tensor.export %2 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %3 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before ScheduleAllocation (iree-stream-schedule-allocation) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} { | |
%4 = stream.async.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%arg2[%c0 to %c5242880 for %c5242880], %arg3[%c0 to %c5242880 for %c5242880]) : (!stream.resource<external>{%c5242880}, !stream.resource<external>{%c5242880}) -> !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
stream.yield %4 : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.tensor.export %2 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %3 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before PackConstants (iree-stream-pack-constants) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%c0_0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0_0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %5 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before PackAllocations (iree-stream-pack-allocations) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%c0_0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0_0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %5 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before LayoutSlices (iree-stream-layout-slices) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%c0_0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0_0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %5 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before PropagateSubranges (iree-util-propagate-subranges) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%c0_0 = arith.constant 0 : index loc(#loc14) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0_0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%c0_0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0_0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %5 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before CSE (cse) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %5 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %5 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before IPO (iree-util-ipo) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before VerifyLoweringToCmd (iree-stream-verify-lowering-to-cmd) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %5 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before CSE (cse) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %5 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %5 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before IPO (iree-util-ipo) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before SCFToControlFlow (convert-scf-to-cf) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %5 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before FixedPointIterator (iree-util-fixed-point-iterator) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %5 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before CSE (cse) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %5 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %5 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {iree.fixedpoint.iteration = 0 : index, torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {iree.fixedpoint.iteration = 0 : index, torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {iree.fixedpoint.iteration = 0 : index, torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before IPO (iree-util-ipo) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {iree.fixedpoint.iteration = 0 : index, torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before ElideTimepoints (iree-stream-elide-timepoints) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {iree.fixedpoint.iteration = 0 : index, torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before FuseDispatchBindings (iree-stream-fuse-dispatch-bindings) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before PackDispatchOperands (iree-stream-pack-dispatch-operands) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2)), %arg3: index loc("/home/prashant/test.mlir":3:3), %arg4: index loc("/home/prashant/test.mlir":3:3), %arg5: index loc(callsite(#loc1 at #loc2))) { | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%0 = arith.addi %c0, %arg3 : index loc(#loc2) | |
%1 = stream.binding.subspan %arg0[%0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%2 = arith.addi %c0, %arg4 : index loc(#loc2) | |
%3 = stream.binding.subspan %arg1[%2] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%4 = arith.addi %c0, %arg5 : index loc(#loc14) | |
%5 = stream.binding.subspan %arg2[%4] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%7 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%8 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%9 = linalg.fill ins(%cst : f16) outs(%8 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%10 = linalg.batch_matmul ins(%6, %7 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%9 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%10, %8 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%8 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%12 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%13 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%14 = arith.addf %12, %13 : f16 loc(#loc22) | |
linalg.yield %14 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %11, %5, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%c0_0 = arith.constant 0 : index loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%c0, %c0, %c0 : index, index, index) { | |
ro %arg2[%c0_0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0_0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0_0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before CSE (cse) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2)), %arg3: i32 loc("/home/prashant/test.mlir":3:3), %arg4: i32 loc(callsite(#loc1 at #loc2)), %arg5: i32 loc("/home/prashant/test.mlir":3:3), %arg6: i32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2)), %arg8: i32 loc(callsite(#loc1 at #loc2))) { | |
%0 = arith.extui %arg3 : i32 to i64 loc(#loc2) | |
%c32_i64 = arith.constant 32 : i64 loc(#loc2) | |
%1 = arith.extui %arg4 : i32 to i64 loc(#loc2) | |
%2 = arith.shli %1, %c32_i64 : i64 loc(#loc2) | |
%3 = arith.ori %0, %2 : i64 loc(#loc2) | |
%4 = arith.index_castui %3 : i64 to index loc(#loc2) | |
%5 = arith.extui %arg5 : i32 to i64 loc(#loc2) | |
%c32_i64_0 = arith.constant 32 : i64 loc(#loc2) | |
%6 = arith.extui %arg6 : i32 to i64 loc(#loc2) | |
%7 = arith.shli %6, %c32_i64_0 : i64 loc(#loc2) | |
%8 = arith.ori %5, %7 : i64 loc(#loc2) | |
%9 = arith.index_castui %8 : i64 to index loc(#loc2) | |
%10 = arith.extui %arg7 : i32 to i64 loc(#loc14) | |
%c32_i64_1 = arith.constant 32 : i64 loc(#loc14) | |
%11 = arith.extui %arg8 : i32 to i64 loc(#loc14) | |
%12 = arith.shli %11, %c32_i64_1 : i64 loc(#loc14) | |
%13 = arith.ori %10, %12 : i64 loc(#loc14) | |
%14 = arith.index_castui %13 : i64 to index loc(#loc14) | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_2 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%15 = arith.addi %c0, %4 : index loc(#loc2) | |
%16 = stream.binding.subspan %arg0[%15] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%17 = arith.addi %c0, %9 : index loc(#loc2) | |
%18 = stream.binding.subspan %arg1[%17] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%19 = arith.addi %c0, %14 : index loc(#loc14) | |
%20 = stream.binding.subspan %arg2[%19] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%21 = flow.dispatch.tensor.load %16, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%22 = flow.dispatch.tensor.load %18, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%23 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%24 = linalg.fill ins(%cst : f16) outs(%23 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%25 = linalg.batch_matmul ins(%21, %22 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%24 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%26 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%25, %23 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%23 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_3: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%27 = arith.mulf %in, %cst_2 : f16 loc(#loc20) | |
%28 = arith.mulf %in_3, %cst : f16 loc(#loc21) | |
%29 = arith.addf %27, %28 : f16 loc(#loc22) | |
linalg.yield %29 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %26, %20, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%c0_0 = arith.constant 0 : index loc(#loc14) | |
%c0_i64 = arith.constant 0 : i64 loc(#loc14) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc14) | |
%c32_i64 = arith.constant 32 : i64 loc(#loc14) | |
%c0_i64_1 = arith.constant 0 : i64 loc(#loc14) | |
%c0_i32_2 = arith.constant 0 : i32 loc(#loc14) | |
%c0_i64_3 = arith.constant 0 : i64 loc(#loc14) | |
%c0_i32_4 = arith.constant 0 : i32 loc(#loc14) | |
%c32_i64_5 = arith.constant 32 : i64 loc(#loc14) | |
%c0_i64_6 = arith.constant 0 : i64 loc(#loc14) | |
%c0_i32_7 = arith.constant 0 : i32 loc(#loc14) | |
%c0_i64_8 = arith.constant 0 : i64 loc(#loc14) | |
%c0_i32_9 = arith.constant 0 : i32 loc(#loc14) | |
%c32_i64_10 = arith.constant 32 : i64 loc(#loc14) | |
%c0_i64_11 = arith.constant 0 : i64 loc(#loc14) | |
%c0_i32_12 = arith.constant 0 : i32 loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%c0_i32, %c0_i32_2, %c0_i32_4, %c0_i32_7, %c0_i32_9, %c0_i32_12 : i32, i32, i32, i32, i32, i32) { | |
ro %arg2[%c0_0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0_0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0_0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before FoldUniformOperands (iree-stream-fold-uniform-operands) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2)), %arg3: i32 loc("/home/prashant/test.mlir":3:3), %arg4: i32 loc(callsite(#loc1 at #loc2)), %arg5: i32 loc("/home/prashant/test.mlir":3:3), %arg6: i32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2)), %arg8: i32 loc(callsite(#loc1 at #loc2))) { | |
%0 = arith.extui %arg3 : i32 to i64 loc(#loc2) | |
%c32_i64 = arith.constant 32 : i64 loc(#loc2) | |
%1 = arith.extui %arg4 : i32 to i64 loc(#loc2) | |
%2 = arith.shli %1, %c32_i64 : i64 loc(#loc2) | |
%3 = arith.ori %0, %2 : i64 loc(#loc2) | |
%4 = arith.index_castui %3 : i64 to index loc(#loc2) | |
%5 = arith.extui %arg5 : i32 to i64 loc(#loc2) | |
%6 = arith.extui %arg6 : i32 to i64 loc(#loc2) | |
%7 = arith.shli %6, %c32_i64 : i64 loc(#loc2) | |
%8 = arith.ori %5, %7 : i64 loc(#loc2) | |
%9 = arith.index_castui %8 : i64 to index loc(#loc2) | |
%10 = arith.extui %arg7 : i32 to i64 loc(#loc14) | |
%11 = arith.extui %arg8 : i32 to i64 loc(#loc14) | |
%12 = arith.shli %11, %c32_i64 : i64 loc(#loc14) | |
%13 = arith.ori %10, %12 : i64 loc(#loc14) | |
%14 = arith.index_castui %13 : i64 to index loc(#loc14) | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%15 = arith.addi %c0, %4 : index loc(#loc2) | |
%16 = stream.binding.subspan %arg0[%15] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%17 = arith.addi %c0, %9 : index loc(#loc2) | |
%18 = stream.binding.subspan %arg1[%17] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%19 = arith.addi %c0, %14 : index loc(#loc14) | |
%20 = stream.binding.subspan %arg2[%19] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%21 = flow.dispatch.tensor.load %16, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%22 = flow.dispatch.tensor.load %18, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%23 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%24 = linalg.fill ins(%cst : f16) outs(%23 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%25 = linalg.batch_matmul ins(%21, %22 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%24 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%26 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%25, %23 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%23 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%27 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%28 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%29 = arith.addf %27, %28 : f16 loc(#loc22) | |
linalg.yield %29 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %26, %20, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096](%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before AnnotateDispatchArguments (iree-stream-annotate-dispatch-arguments) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding loc(callsite(#loc1 at #loc2))) { | |
%c0_i32 = arith.constant 0 : i32 loc(#loc14) | |
%0 = arith.extui %c0_i32 : i32 to i64 loc(#loc2) | |
%c32_i64 = arith.constant 32 : i64 loc(#loc2) | |
%1 = arith.extui %c0_i32 : i32 to i64 loc(#loc2) | |
%2 = arith.shli %1, %c32_i64 : i64 loc(#loc2) | |
%3 = arith.ori %0, %2 : i64 loc(#loc2) | |
%4 = arith.index_castui %3 : i64 to index loc(#loc2) | |
%5 = arith.extui %c0_i32 : i32 to i64 loc(#loc2) | |
%6 = arith.extui %c0_i32 : i32 to i64 loc(#loc2) | |
%7 = arith.shli %6, %c32_i64 : i64 loc(#loc2) | |
%8 = arith.ori %5, %7 : i64 loc(#loc2) | |
%9 = arith.index_castui %8 : i64 to index loc(#loc2) | |
%10 = arith.extui %c0_i32 : i32 to i64 loc(#loc14) | |
%11 = arith.extui %c0_i32 : i32 to i64 loc(#loc14) | |
%12 = arith.shli %11, %c32_i64 : i64 loc(#loc14) | |
%13 = arith.ori %10, %12 : i64 loc(#loc14) | |
%14 = arith.index_castui %13 : i64 to index loc(#loc14) | |
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%15 = arith.addi %c0, %4 : index loc(#loc2) | |
%16 = stream.binding.subspan %arg0[%15] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%17 = arith.addi %c0, %9 : index loc(#loc2) | |
%18 = stream.binding.subspan %arg1[%17] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%19 = arith.addi %c0, %14 : index loc(#loc14) | |
%20 = stream.binding.subspan %arg2[%19] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%21 = flow.dispatch.tensor.load %16, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%22 = flow.dispatch.tensor.load %18, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%23 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%24 = linalg.fill ins(%cst : f16) outs(%23 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%25 = linalg.batch_matmul ins(%21, %22 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%24 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%26 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%25, %23 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%23 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%27 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%28 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%29 = arith.addf %27, %28 : f16 loc(#loc22) | |
linalg.yield %29 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %26, %20, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before MemoizeChannels (iree-stream-memoize-channels) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
// Whole program as it stands before the MemoizeChannels pass: one stream executable holding the
// dispatch body, plus the @forward entry point that invokes it.
module attributes {torch.debug_module_name = "_lambda"} { | |
// Executable containing the single dispatch entry point and its implementation module.
stream.executable private @forward_dispatch_0 { | |
// Export: the workgroups region takes three index values and returns the (x, y, z) counts produced by
// flow.dispatch.workgroup_count_from_dag_root.
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
// Dispatch body: loads both operands from their bindings, computes a batch matmul into a zero-filled
// accumulator, applies an element-wise epilogue, and stores the result through the third binding.
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding {stream.alignment = 64 : index} loc(callsite(#loc1 at #loc2))) { | |
// Three binding offsets (%4, %9, %14). Each is built by zero-extending %c0_i32 twice, shifting one copy
// left by 32, OR-ing the two copies and casting to index. Every input is the constant 0, so each is 0.
// NOTE(review): the shape looks like a 64-bit value assembled from two 32-bit halves - confirm.
%c0_i32 = arith.constant 0 : i32 loc(#loc14) | |
%0 = arith.extui %c0_i32 : i32 to i64 loc(#loc2) | |
%c32_i64 = arith.constant 32 : i64 loc(#loc2) | |
%1 = arith.extui %c0_i32 : i32 to i64 loc(#loc2) | |
%2 = arith.shli %1, %c32_i64 : i64 loc(#loc2) | |
%3 = arith.ori %0, %2 : i64 loc(#loc2) | |
%4 = arith.index_castui %3 : i64 to index loc(#loc2) | |
%5 = arith.extui %c0_i32 : i32 to i64 loc(#loc2) | |
%6 = arith.extui %c0_i32 : i32 to i64 loc(#loc2) | |
%7 = arith.shli %6, %c32_i64 : i64 loc(#loc2) | |
%8 = arith.ori %5, %7 : i64 loc(#loc2) | |
%9 = arith.index_castui %8 : i64 to index loc(#loc2) | |
%10 = arith.extui %c0_i32 : i32 to i64 loc(#loc14) | |
%11 = arith.extui %c0_i32 : i32 to i64 loc(#loc14) | |
%12 = arith.shli %11, %c32_i64 : i64 loc(#loc14) | |
%13 = arith.ori %10, %12 : i64 loc(#loc14) | |
%14 = arith.index_castui %13 : i64 to index loc(#loc14) | |
// Scalar constants: %cst = 0.0 (fill value and factor for the second generic input),
// %cst_0 = 0.125 (factor applied to the matmul result).
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
// Each binding is viewed at (%c0 + its offset): two read-only inputs and one write-only output.
%15 = arith.addi %c0, %4 : index loc(#loc2) | |
%16 = stream.binding.subspan %arg0[%15] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%17 = arith.addi %c0, %9 : index loc(#loc2) | |
%18 = stream.binding.subspan %arg1[%17] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%19 = arith.addi %c0, %14 : index loc(#loc14) | |
%20 = stream.binding.subspan %arg2[%19] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
// Load both operands in full (zero offsets, unit strides, sizes equal to the tensor shapes).
%21 = flow.dispatch.tensor.load %16, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%22 = flow.dispatch.tensor.load %18, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
// %23 is used three ways below: as the fill destination, as the second generic input, and as the generic init.
%23 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
// Accumulator filled with 0.0, then the 10-batch (4096x64) x (64x4096) matmul accumulates into it.
%24 = linalg.fill ins(%cst : f16) outs(%23 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%25 = linalg.batch_matmul ins(%21, %22 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%24 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
// Element-wise epilogue over three parallel dimensions: yield = in * 0.125 + in_1 * 0.0.
// NOTE(review): in_1 is read from %23, the tensor.empty result, which this function never writes.
// If those contents are unspecified, in_1 * 0.0 is not guaranteed to be 0 (NaN/Inf inputs) - confirm
// against the original test.mlir that this operand is meant to be ignored.
%26 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%25, %23 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%23 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%27 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%28 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%29 = arith.addf %27, %28 : f16 loc(#loc22) | |
linalg.yield %29 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
// Write the full result tensor through the write-only binding.
flow.dispatch.tensor.store %26, %20, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
// Entry point: checks both incoming buffer views, imports them as external stream resources, runs the
// dispatch into a newly allocated result resource, waits for it, and returns the result as a buffer view.
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
// Resource sizes and dimension constants. 5242880 equals 10*4096*64 f16 elements at 2 bytes each;
// 335544320 equals 10*4096*4096 f16 elements at 2 bytes each.
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
// Type and encoding codes passed to the asserts below.
// NOTE(review): presumably the HAL element-type code for f16 and a dense encoding - confirm.
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
// First argument: asserted as 10x4096x64, then imported as tensor<10x4096x64xf16>.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
// Second argument: asserted as 10x64x4096, then imported as tensor<10x64x4096xf16>.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
// Result storage, allocated uninitialized; the dispatch below binds it write-only.
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
// %c0_i32 has no use in the visible body of this function.
%c0_i32 = arith.constant 0 : i32 loc(#loc14) | |
// Execute region capturing the two inputs and the output; it issues one dispatch with workload
// [10, 4096, 4096] and binds each resource from offset 0 for its full size (ro, ro, wo).
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
// Wait on the execute timepoint, then export the result resource as tensor<10x4096x4096xf16>.
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
// Trailing location aliases for the module above.
// #loc is the position attached to the module's closing brace.
#loc = loc("/home/prashant/test.mlir":2:1) | |
// Plain file:line:column positions in test.mlir, referenced only through the call-site aliases below.
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
// Call-site wrappers at #loc2. In the dispatch function above they are attached, in order, to:
// %cst, %cst_0, tensor.empty, linalg.fill, linalg.batch_matmul, and the two mulf ops and the addf
// inside the linalg.generic body.
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
// Entry point as seen before the Canonicalizer pass (locations are printed inline in this dump).
// It checks both buffer views, imports them as external stream resources, dispatches into a newly
// allocated result resource, awaits completion, and returns the exported result.
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
// Resource sizes and dimension constants. 5242880 equals 10*4096*64 f16 elements at 2 bytes each;
// 335544320 equals 10*4096*4096 f16 elements at 2 bytes each.
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// Type and encoding codes passed to the asserts below.
// NOTE(review): presumably the HAL element-type code for f16 and a dense encoding - confirm.
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
// Each argument is asserted against its expected shape and then imported with the matching tensor type.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
// Result storage, allocated uninitialized; the dispatch below binds it write-only.
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// %c0_i32 has no use anywhere in this function.
%c0_i32 = arith.constant 0 : i32 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// One dispatch with workload [10, 4096, 4096]; each captured resource is bound from offset 0 for its
// full size (two read-only inputs, one write-only output).
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// Wait on the execute timepoint, then export the result resource as tensor<10x4096x4096xf16>.
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %5 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before CSE (cse) //----- // | |
// Entry point as seen before the CSE pass. All constants are grouped at the top, with %c0 first.
// Flow: assert + import both inputs, allocate the result, dispatch, await, export, return.
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
// %c0 is the binding offset used by the dispatch below.
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// Resource sizes: 5242880 equals 10*4096*64 f16 elements at 2 bytes each; 335544320 equals
// 10*4096*4096 f16 elements at 2 bytes each.
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// Type and encoding codes passed to the asserts below.
// NOTE(review): presumably the HAL element-type code for f16 and a dense encoding - confirm.
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
// Input 0: asserted as 10x4096x64 and imported as tensor<10x4096x64xf16>.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
// Input 1: asserted as 10x64x4096 and imported as tensor<10x64x4096xf16>.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
// Result storage, allocated uninitialized; the dispatch binds it write-only.
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// Single dispatch, workload [10, 4096, 4096]; bindings are ro, ro, wo, each from offset 0 for the full size.
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// Await the timepoint to obtain the written resource, then export it as the function result.
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %5 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
// Entry point as seen before the SimplifyGlobalAccesses pass.
// Sequence: validate and import the two inputs, allocate the output, run one dispatch, wait, export.
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
// Offset 0, used for every binding range in the dispatch.
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// Sizes of the input resources (5242880 = 10*4096*64*2) and the output resource
// (335544320 = 10*4096*4096*2); the factor 2 matches the f16 element width.
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// Dimension constants, also reused as the dispatch workload.
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// Type and encoding codes consumed by the asserts.
// NOTE(review): presumably the HAL element-type code for f16 and a dense encoding - confirm.
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
// Shape/type assert followed by import, once per argument.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
// Output resource, allocated uninitialized and bound write-only in the dispatch.
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// The execute region yields a timepoint; its only command is the dispatch of
// @forward_dispatch_0_batch_matmul_10x4096x4096x64 with workload [10, 4096, 4096].
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// Resolve the timepoint into the output resource and hand it back as a buffer view.
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %5 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // | |
// Leading aliases for this dump: source positions, the shared indexing map, and one call-site location.
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
// The same three positions are printed inline on the linalg.generic block arguments in the module below.
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
// Three-dimensional identity map, used for all operands of the linalg.generic below.
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
// Position 14:10 at call site 3:3.
#loc14 = loc(callsite(#loc1 at #loc2)) | |
// Whole program as it stands before the ApplyPatterns pass: the dispatch executable plus @forward.
module attributes {torch.debug_module_name = "_lambda"} { | |
// Executable wrapping the export declaration and the module that implements it.
stream.executable private @forward_dispatch_0 { | |
// Export: the workgroups region maps three index values to (x, y, z) counts via
// flow.dispatch.workgroup_count_from_dag_root.
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
// Dispatch body: two read-only operand bindings and one write-only result binding. It computes
// batch_matmul(arg0, arg1) into a zero-filled accumulator, then an element-wise epilogue, then stores.
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding {stream.alignment = 64 : index} loc(callsite(#loc1 at #loc2))) { | |
// Offsets %4, %9 and %14 are each computed as (zext(0) | (zext(0) << 32)) cast to index.
// All inputs are the constant 0, so all three offsets evaluate to 0.
// NOTE(review): the shape looks like a 64-bit value assembled from two 32-bit halves - confirm.
%c0_i32 = arith.constant 0 : i32 loc(#loc14) | |
%0 = arith.extui %c0_i32 : i32 to i64 loc(#loc2) | |
%c32_i64 = arith.constant 32 : i64 loc(#loc2) | |
%1 = arith.extui %c0_i32 : i32 to i64 loc(#loc2) | |
%2 = arith.shli %1, %c32_i64 : i64 loc(#loc2) | |
%3 = arith.ori %0, %2 : i64 loc(#loc2) | |
%4 = arith.index_castui %3 : i64 to index loc(#loc2) | |
%5 = arith.extui %c0_i32 : i32 to i64 loc(#loc2) | |
%6 = arith.extui %c0_i32 : i32 to i64 loc(#loc2) | |
%7 = arith.shli %6, %c32_i64 : i64 loc(#loc2) | |
%8 = arith.ori %5, %7 : i64 loc(#loc2) | |
%9 = arith.index_castui %8 : i64 to index loc(#loc2) | |
%10 = arith.extui %c0_i32 : i32 to i64 loc(#loc14) | |
%11 = arith.extui %c0_i32 : i32 to i64 loc(#loc14) | |
%12 = arith.shli %11, %c32_i64 : i64 loc(#loc14) | |
%13 = arith.ori %10, %12 : i64 loc(#loc14) | |
%14 = arith.index_castui %13 : i64 to index loc(#loc14) | |
// %cst = 0.0 is the fill value and the factor on the second generic input; %cst_0 = 0.125 scales the matmul result.
%cst = arith.constant 0.000000e+00 : f16 loc(#loc15) | |
%cst_0 = arith.constant 1.250000e-01 : f16 loc(#loc16) | |
%c0 = arith.constant 0 : index loc(#loc14) | |
// Bindings are viewed at (%c0 + offset) with their tensor types and access modes.
%15 = arith.addi %c0, %4 : index loc(#loc2) | |
%16 = stream.binding.subspan %arg0[%15] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%17 = arith.addi %c0, %9 : index loc(#loc2) | |
%18 = stream.binding.subspan %arg1[%17] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%19 = arith.addi %c0, %14 : index loc(#loc14) | |
%20 = stream.binding.subspan %arg2[%19] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
// Full-tensor loads of both operands.
%21 = flow.dispatch.tensor.load %16, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%22 = flow.dispatch.tensor.load %18, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
// %23 serves as the fill destination, the second generic input, and the generic init.
%23 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
// Zero-fill, then batch matmul accumulating into the filled tensor.
%24 = linalg.fill ins(%cst : f16) outs(%23 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%25 = linalg.batch_matmul ins(%21, %22 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%24 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
// Element-wise epilogue (all iterators parallel, identity maps): yield = in * 0.125 + in_1 * 0.0.
// NOTE(review): in_1 comes from %23, the tensor.empty result, which this function never writes.
// If its contents are unspecified, in_1 * 0.0 need not be 0 (NaN/Inf) - confirm against test.mlir
// that this operand is meant to be ignored.
%26 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%25, %23 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%23 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%27 = arith.mulf %in, %cst_0 : f16 loc(#loc20) | |
%28 = arith.mulf %in_1, %cst : f16 loc(#loc21) | |
%29 = arith.addf %27, %28 : f16 loc(#loc22) | |
linalg.yield %29 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
// Store the whole result through the write-only binding.
flow.dispatch.tensor.store %26, %20, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
// Entry point: assert + import both inputs, allocate the result, dispatch, await, export, return.
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
// Offset 0 for every binding range in the dispatch.
%c0 = arith.constant 0 : index loc(#loc14) | |
// Resource sizes: 5242880 equals 10*4096*64 f16 elements at 2 bytes each; 335544320 equals
// 10*4096*4096 f16 elements at 2 bytes each.
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
// Type and encoding codes passed to the asserts below.
// NOTE(review): presumably the HAL element-type code for f16 and a dense encoding - confirm.
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
// Input 0: asserted as 10x4096x64 and imported as tensor<10x4096x64xf16>.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
// Input 1: asserted as 10x64x4096 and imported as tensor<10x64x4096xf16>.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
// Result storage, allocated uninitialized; the dispatch binds it write-only.
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
// Single dispatch, workload [10, 4096, 4096]; bindings are ro, ro, wo, each from offset 0 for the full size.
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
// Await the timepoint, then export the result resource as tensor<10x4096x4096xf16>.
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
// Trailing location aliases for the module above; #loc is attached to the module's closing brace.
#loc = loc("/home/prashant/test.mlir":2:1) | |
// Source positions in test.mlir, referenced only through the call-site aliases below.
#loc3 = loc("/home/prashant/test.mlir":4:12) | |
#loc4 = loc("/home/prashant/test.mlir":5:14) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
// Call-site wrappers at #loc2. In the dispatch function above: #loc15 -> %cst (0.0), #loc16 -> %cst_0 (0.125),
// #loc17 -> tensor.empty, #loc18 -> linalg.fill, #loc19 -> linalg.batch_matmul,
// #loc20/#loc21 -> the two mulf ops, #loc22 -> the addf.
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // | |
// Aliases printed ahead of the module for this dump.
// #loc1 and #loc2 are combined into the call-site alias #loc14 at the end of this run.
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
// Positions that also appear inline on the three linalg.generic block arguments in the module below.
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
// Identity indexing map over (d0, d1, d2).
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
// Position 14:10 at call site 3:3; this is the most frequently used location in the module below.
#loc14 = loc(callsite(#loc1 at #loc2)) | |
// Whole program as it stands before the FoldGlobals pass. In this dump the dispatch body uses %c0
// directly as every binding offset; there is no per-binding offset arithmetic.
module attributes {torch.debug_module_name = "_lambda"} { | |
// Executable wrapping the export declaration and the module that implements it.
stream.executable private @forward_dispatch_0 { | |
// Export: the workgroups region maps three index values to (x, y, z) counts via
// flow.dispatch.workgroup_count_from_dag_root.
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
// Dispatch body: load both operands, batch-matmul into a zero-filled accumulator, apply the
// element-wise epilogue, store through the write-only binding.
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding {stream.alignment = 64 : index} loc(callsite(#loc1 at #loc2))) { | |
// Constants. Here %cst is 0.125 (scale for the matmul result) and %cst_0 is 0.0 (fill value and
// factor on the second generic input).
%c0 = arith.constant 0 : index loc(#loc14) | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
// All three bindings are viewed at offset %c0: two read-only inputs, one write-only output.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
// Full-tensor loads of both operands.
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
// %5 serves as the fill destination, the second generic input, and the generic init.
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
// Zero-fill, then batch matmul accumulating into the filled tensor.
%6 = linalg.fill ins(%cst_0 : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
// Element-wise epilogue (all iterators parallel, identity maps): yield = in * 0.125 + in_1 * 0.0.
// NOTE(review): in_1 comes from %5, the tensor.empty result, which this function never writes.
// If its contents are unspecified, in_1 * 0.0 need not be 0 (NaN/Inf) - confirm against test.mlir
// that this operand is meant to be ignored.
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
// Store the whole result through the write-only binding.
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
// Entry point: assert + import both inputs, allocate the result, dispatch, await, export, return.
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
// Offset 0 for every binding range in the dispatch.
%c0 = arith.constant 0 : index loc(#loc14) | |
// Resource sizes: 5242880 equals 10*4096*64 f16 elements at 2 bytes each; 335544320 equals
// 10*4096*4096 f16 elements at 2 bytes each.
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
// Type and encoding codes passed to the asserts below.
// NOTE(review): presumably the HAL element-type code for f16 and a dense encoding - confirm.
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
// Input 0: asserted as 10x4096x64 and imported as tensor<10x4096x64xf16>.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
// Input 1: asserted as 10x64x4096 and imported as tensor<10x64x4096xf16>.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
// Result storage, allocated uninitialized; the dispatch binds it write-only.
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
// Single dispatch, workload [10, 4096, 4096]; bindings are ro, ro, wo, each from offset 0 for the full size.
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
// Await the timepoint, then export the result resource as tensor<10x4096x4096xf16>.
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
// Trailing location aliases for the module above; #loc is attached to the module's closing brace.
#loc = loc("/home/prashant/test.mlir":2:1) | |
// In this table #loc3 is 5:14 and #loc4 is 4:12. Through #loc15/#loc16 below, 5:14 is attached to
// %cst (0.125) and 4:12 to %cst_0 (0.0) in the dispatch function above.
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
// Call-site wrappers: each pairs one of the positions above with call site #loc2.
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // | |
// Aliases printed ahead of the module for this dump: source positions, the identity indexing map,
// and the call-site location built from #loc1 and #loc2.
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
// Identity indexing map over (d0, d1, d2).
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
// Position 14:10 at call site 3:3.
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding {stream.alignment = 64 : index} loc(callsite(#loc1 at #loc2))) { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst_0 : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before IPO (iree-util-ipo) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding {stream.alignment = 64 : index} loc(callsite(#loc1 at #loc2))) { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst_0 : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before SymbolDCE (symbol-dce) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding {stream.alignment = 64 : index} loc(callsite(#loc1 at #loc2))) { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst_0 : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding {stream.alignment = 64 : index} loc(callsite(#loc1 at #loc2))) { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst_0 : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before CSE (cse) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding {stream.alignment = 64 : index} loc(callsite(#loc1 at #loc2))) { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst_0 : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c5242880 = arith.constant 5242880 : index loc("/home/prashant/test.mlir":3:3) | |
%c335544320 = arith.constant 335544320 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c1_i32 = arith.constant 1 : i32 loc("/home/prashant/test.mlir":3:3) | |
%c64 = arith.constant 64 : index loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc("/home/prashant/test.mlir":3:3) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc("/home/prashant/test.mlir":3:3) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} => !stream.timepoint loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
return %5 : !hal.buffer_view loc("/home/prashant/test.mlir":3:3) | |
} loc("/home/prashant/test.mlir":3:3) | |
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding {stream.alignment = 64 : index} loc(callsite(#loc1 at #loc2))) { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst_0 : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding {stream.alignment = 64 : index} loc(callsite(#loc1 at #loc2))) { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst_0 : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding {stream.alignment = 64 : index} loc(callsite(#loc1 at #loc2))) { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst_0 : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before mlir::iree_compiler::IREE::HAL::AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- // | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
module attributes {torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding {stream.alignment = 64 : index} loc(callsite(#loc1 at #loc2))) { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst_0 : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before mlir::iree_compiler::IREE::HAL::VerifyTargetEnvironmentPass (iree-hal-verify-target-environment) //----- // | |
#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>]>>}> | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
#device_target_vulkan = #hal.device.target<"vulkan", {executable_targets = [#executable_target_vulkan_spirv_fb], legacy_sync}> | |
module attributes {hal.device.targets = [#device_target_vulkan], torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding {stream.alignment = 64 : index} loc(callsite(#loc1 at #loc2))) { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst_0 : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before mlir::iree_compiler::IREE::HAL::{anonymous}::MaterializeInterfacesPass (iree-hal-materialize-interfaces) //----- // | |
#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>]>>}> | |
#loc1 = loc("/home/prashant/test.mlir":14:10) | |
#loc2 = loc("/home/prashant/test.mlir":3:3) | |
#loc8 = loc("/home/prashant/test.mlir":10:10) | |
#loc9 = loc("/home/prashant/test.mlir":15:20) | |
#loc10 = loc("/home/prashant/test.mlir":15:32) | |
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#loc14 = loc(callsite(#loc1 at #loc2)) | |
#device_target_vulkan = #hal.device.target<"vulkan", {executable_targets = [#executable_target_vulkan_spirv_fb], legacy_sync}> | |
module attributes {hal.device.targets = [#device_target_vulkan], torch.debug_module_name = "_lambda"} { | |
stream.executable private @forward_dispatch_0 { | |
stream.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 workgroups(%arg0: index loc(callsite(#loc1 at #loc2)), %arg1: index loc(callsite(#loc1 at #loc2)), %arg2: index loc(callsite(#loc1 at #loc2))) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 loc(#loc14) | |
stream.return %x, %y, %z : index, index, index loc(#loc14) | |
} loc(#loc14) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64(%arg0: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg1: !stream.binding {stream.alignment = 64 : index} loc("/home/prashant/test.mlir":3:3), %arg2: !stream.binding {stream.alignment = 64 : index} loc(callsite(#loc1 at #loc2))) { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%cst = arith.constant 1.250000e-01 : f16 loc(#loc15) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(#loc16) | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc(#loc2) | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc(#loc2) | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(#loc14) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(#loc14) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(#loc17) | |
%6 = linalg.fill ins(%cst_0 : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc18) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(#loc19) | |
%8 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst : f16 loc(#loc20) | |
%10 = arith.mulf %in_1, %cst_0 : f16 loc(#loc21) | |
%11 = arith.addf %9, %10 : f16 loc(#loc22) | |
linalg.yield %11 : f16 loc(#loc14) | |
} -> tensor<10x4096x4096xf16> loc(#loc14) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(#loc14) | |
return loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
} loc(#loc14) | |
func.func @forward(%arg0: !hal.buffer_view loc("/home/prashant/test.mlir":3:3), %arg1: !hal.buffer_view loc("/home/prashant/test.mlir":3:3)) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index loc(#loc14) | |
%c5242880 = arith.constant 5242880 : index loc(#loc2) | |
%c335544320 = arith.constant 335544320 : index loc(#loc14) | |
%c10 = arith.constant 10 : index loc(#loc14) | |
%c4096 = arith.constant 4096 : index loc(#loc14) | |
%c553648144_i32 = arith.constant 553648144 : i32 loc(#loc2) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc2) | |
%c64 = arith.constant 64 : index loc(#loc2) | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c10, %c4096, %c64]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<10x4096x64xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c10, %c64, %c4096]) type(%c553648144_i32) encoding(%c1_i32) loc(#loc2) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<10x64x4096xf16> in !stream.resource<external>{%c5242880} loc(#loc2) | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c335544320} loc(#loc14) | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c5242880}, %1 as %arg3: !stream.resource<external>{%c5242880}, %2 as %arg4: !stream.resource<external>{%c335544320}) { | |
stream.cmd.dispatch @forward_dispatch_0::@forward_dispatch_0_batch_matmul_10x4096x4096x64[%c10, %c4096, %c4096] { | |
ro %arg2[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
ro %arg3[%c0 for %c5242880] : !stream.resource<external>{%c5242880}, | |
wo %arg4[%c0 for %c335544320] : !stream.resource<external>{%c335544320} | |
} loc(#loc14) | |
} => !stream.timepoint loc(#loc14) | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c335544320} loc(#loc14) | |
%5 = stream.tensor.export %4 : tensor<10x4096x4096xf16> in !stream.resource<external>{%c335544320} -> !hal.buffer_view loc(#loc2) | |
return %5 : !hal.buffer_view loc(#loc2) | |
} loc(#loc2) | |
} loc(#loc) | |
#loc = loc("/home/prashant/test.mlir":2:1) | |
#loc3 = loc("/home/prashant/test.mlir":5:14) | |
#loc4 = loc("/home/prashant/test.mlir":4:12) | |
#loc5 = loc("/home/prashant/test.mlir":6:10) | |
#loc6 = loc("/home/prashant/test.mlir":7:10) | |
#loc7 = loc("/home/prashant/test.mlir":8:10) | |
#loc11 = loc("/home/prashant/test.mlir":11:12) | |
#loc12 = loc("/home/prashant/test.mlir":16:12) | |
#loc13 = loc("/home/prashant/test.mlir":17:12) | |
#loc15 = loc(callsite(#loc3 at #loc2)) | |
#loc16 = loc(callsite(#loc4 at #loc2)) | |
#loc17 = loc(callsite(#loc5 at #loc2)) | |
#loc18 = loc(callsite(#loc6 at #loc2)) | |
#loc19 = loc(callsite(#loc7 at #loc2)) | |
#loc20 = loc(callsite(#loc11 at #loc2)) | |
#loc21 = loc(callsite(#loc12 at #loc2)) | |
#loc22 = loc(callsite(#loc13 at #loc2)) | |
// -----// IR Dump Before mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass (iree-hal-translate-executables) //----- // | |
hal.executable private @forward_dispatch_0 { | |
hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>]>>}> { | |
hal.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) { | |
^bb0(%arg0: !hal.device loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg1: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg2: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg3: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3))): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
hal.return %x, %y, %z : index, index, index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = linalg.fill ins(%cst_0 : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%10 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%11 = arith.addf %9, %10 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %11 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before mlir::iree_compiler::IREE::HAL::TranslateTargetExecutableVariantsPass (iree-hal-translate-target-executable-variants) //----- // | |
hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>]>>}> { | |
hal.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) { | |
^bb0(%arg0: !hal.device loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg1: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg2: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg3: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3))): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
hal.return %x, %y, %z : index, index, index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = linalg.fill ins(%cst_0 : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%10 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%11 = arith.addf %9, %10 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %11 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before TypePropagation (iree-codegen-type-propagation) //----- // | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = linalg.fill ins(%cst_0 : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%10 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%11 = arith.addf %9, %10 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %11 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before BufferizeCopyOnlyDispatches (iree-codegen-bufferize-copy-only-dispatches) //----- // | |
module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = linalg.fill ins(%cst_0 : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%10 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%11 = arith.addf %9, %10 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %11 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before SPIRVLowerExecutableTarget (iree-spirv-lower-executable-target-pass) //----- // | |
hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>]>>}> { | |
hal.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) { | |
^bb0(%arg0: !hal.device loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg1: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg2: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg3: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3))): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
hal.return %x, %y, %z : index, index, index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = linalg.fill ins(%cst_0 : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = linalg.batch_matmul ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%10 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%11 = arith.addf %9, %10 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %11 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) //----- // | |
hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>]>>}> { | |
hal.executable.export public @forward_dispatch_0_batch_matmul_10x4096x4096x64 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {subgroup_size = 32 : index, translation_info = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize pipeline_depth = 1 store_stage = 0>, workgroup_size = [64 : index, 2 : index, 1 : index]} { | |
^bb0(%arg0: !hal.device loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg1: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg2: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)), %arg3: index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3))): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
hal.return %x, %y, %z : index, index, index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
builtin.module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<10x4096x64xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<10x64x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = tensor.empty() : tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":6:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = linalg.fill ins(%cst_0 : f16) outs(%5 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]]>} ins(%3, %4 : tensor<10x4096x64xf16>, tensor<10x64x4096xf16>) outs(%6 : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %5 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%5 : tensor<10x4096x4096xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_1: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%9 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%10 = arith.mulf %in_1, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%11 = arith.addf %9, %10 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %11 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<10x4096x4096xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0], sizes = [10, 4096, 4096], strides = [1, 1, 1] : tensor<10x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before ConvertToDestinationPassingStyle (iree-codegen-convert-to-destination-passing-style) //----- // | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
%c64 = arith.constant 64 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c128 = arith.constant 128 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg0 = %workgroup_id_z to %c10 step %workgroup_count_z { | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg1 = %3 to %c4096 step %4 { | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg2 = %5 to %c4096 step %6 { | |
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [1, %c64, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<1x?x64xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%8 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0, %arg2], sizes = [1, 64, %c128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<1x64x?xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%9 = tensor.empty() : tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%10 = linalg.fill ins(%cst_0 : f16) outs(%9 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%cast = tensor.cast %8 : tensor<1x64x?xf16> to tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%cast_1 = tensor.cast %7 : tensor<1x?x64xf16> to tensor<1x64x64xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%11 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]]>} ins(%cast_1, %cast : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%10 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%12 = tensor.empty() : tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%13 = tensor.empty() : tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%11, %12 : tensor<1x64x128xf16>, tensor<1x64x128xf16>) outs(%13 : tensor<1x64x128xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":10:10), %in_3: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%15 = arith.mulf %in, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%16 = arith.mulf %in_3, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%17 = arith.addf %15, %16 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %17 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cast_2 = tensor.cast %14 : tensor<1x64x128xf16> to tensor<1x?x?xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.dispatch.tensor.store %cast_2, %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, %c64, %c128], strides = [1, 1, 1] : tensor<1x?x?xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
%c64 = arith.constant 64 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c128 = arith.constant 128 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg0 = %workgroup_id_z to %c10 step %workgroup_count_z { | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg1 = %3 to %c4096 step %4 { | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg2 = %5 to %c4096 step %6 { | |
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, %c64, %c128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> -> tensor<1x?x?xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cast = tensor.cast %7 : tensor<1x?x?xf16> to tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [1, %c64, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<1x?x64xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%9 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0, %arg2], sizes = [1, 64, %c128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<1x64x?xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%10 = linalg.fill ins(%cst_0 : f16) outs(%cast : tensor<1x64x128xf16>) -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%cast_1 = tensor.cast %9 : tensor<1x64x?xf16> to tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%cast_2 = tensor.cast %8 : tensor<1x?x64xf16> to tensor<1x64x64xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%11 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]]>} ins(%cast_2, %cast_1 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%10 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%12 = tensor.empty() : tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%12 : tensor<1x64x128xf16>) outs(%11 : tensor<1x64x128xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%14 = arith.mulf %out, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%15 = arith.mulf %in, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%16 = arith.addf %14, %15 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %16 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cast_3 = tensor.cast %13 : tensor<1x64x128xf16> to tensor<1x?x?xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.dispatch.tensor.store %cast_3, %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, %c64, %c128], strides = [1, 1, 1] : tensor<1x?x?xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before CSE (cse) //----- // | |
module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg0 = %workgroup_id_z to %c10 step %workgroup_count_z { | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg1 = %3 to %c4096 step %4 { | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg2 = %5 to %c4096 step %6 { | |
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, 64, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [1, 64, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<1x64x64xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%9 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0, %arg2], sizes = [1, 64, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%10 = linalg.fill ins(%cst_0 : f16) outs(%7 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%11 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]]>} ins(%8, %9 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%10 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%12 = tensor.empty() : tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%12 : tensor<1x64x128xf16>) outs(%11 : tensor<1x64x128xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%14 = arith.mulf %out, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%15 = arith.mulf %in, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%16 = arith.addf %14, %15 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %16 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.dispatch.tensor.store %13, %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, 64, 128], strides = [1, 1, 1] : tensor<1x64x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before EliminateEmptyTensors (iree-eliminate-empty-tensors) //----- // | |
module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg0 = %workgroup_id_z to %c10 step %workgroup_count_z { | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg1 = %3 to %c4096 step %4 { | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg2 = %5 to %c4096 step %6 { | |
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, 64, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [1, 64, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<1x64x64xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%9 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0, %arg2], sizes = [1, 64, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%10 = linalg.fill ins(%cst_0 : f16) outs(%7 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%11 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]]>} ins(%8, %9 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%10 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%12 = tensor.empty() : tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%12 : tensor<1x64x128xf16>) outs(%11 : tensor<1x64x128xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%14 = arith.mulf %out, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%15 = arith.mulf %in, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%16 = arith.addf %14, %15 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %16 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.dispatch.tensor.store %13, %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, 64, 128], strides = [1, 1, 1] : tensor<1x64x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- // | |
module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg0 = %workgroup_id_z to %c10 step %workgroup_count_z { | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg1 = %3 to %c4096 step %4 { | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg2 = %5 to %c4096 step %6 { | |
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, 64, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [1, 64, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<1x64x64xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%9 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0, %arg2], sizes = [1, 64, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%10 = linalg.fill ins(%cst_0 : f16) outs(%7 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%11 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]]>} ins(%8, %9 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%10 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%12 = tensor.empty() : tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%12 : tensor<1x64x128xf16>) outs(%11 : tensor<1x64x128xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%14 = arith.mulf %out, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%15 = arith.mulf %in, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%16 = arith.addf %14, %15 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %16 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.dispatch.tensor.store %13, %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, 64, 128], strides = [1, 1, 1] : tensor<1x64x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before IREEComprehensiveBufferize (iree-codegen-iree-comprehensive-bufferize) //----- // | |
module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg0 = %workgroup_id_z to %c10 step %workgroup_count_z { | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg1 = %3 to %c4096 step %4 { | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg2 = %5 to %c4096 step %6 { | |
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, 64, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [1, 64, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> -> tensor<1x64x64xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%9 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0, %arg2], sizes = [1, 64, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%10 = linalg.fill ins(%cst_0 : f16) outs(%7 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%11 = linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]]>} ins(%8, %9 : tensor<1x64x64xf16>, tensor<1x64x128xf16>) outs(%10 : tensor<1x64x128xf16>) -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%12 = bufferization.alloc_tensor() : tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%12 : tensor<1x64x128xf16>) outs(%11 : tensor<1x64x128xf16>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%14 = arith.mulf %out, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%15 = arith.mulf %in, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%16 = arith.addf %14, %15 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %16 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} -> tensor<1x64x128xf16> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
flow.dispatch.tensor.store %13, %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, 64, 128], strides = [1, 1, 1] : tensor<1x64x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- // | |
module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
memref.assume_alignment %0, 64 : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
memref.assume_alignment %2, 64 : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3) | |
%4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
memref.assume_alignment %4, 64 : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg0 = %workgroup_id_z to %c10 step %workgroup_count_z { | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg1 = %6 to %c4096 step %7 { | |
%8 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%9 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg2 = %8 to %c4096 step %9 { | |
%subview = memref.subview %4[%arg0, %arg1, %arg2] [1, 64, 128] [1, 1, 1] : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_1 = memref.subview %0[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x64xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_2 = memref.subview %2[%arg0, 0, %arg2] [1, 64, 128] [1, 1, 1] : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.fill ins(%cst_0 : f16) outs(%subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]]>} ins(%subview_1, %subview_2 : memref<1x64x64xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1x64x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%alloc = memref.alloc() {alignment = 128 : i64} : memref<1x64x128xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%alloc : memref<1x64x128xf16, 3>) outs(%subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%10 = arith.mulf %out, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%11 = arith.mulf %in, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%12 = arith.addf %10, %11 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %12 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_3 = memref.subview %4[%arg0, %arg1, %arg2] [1, 64, 128] [1, 1, 1] : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
memref.copy %subview, %subview_3 : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
memref.assume_alignment %0, 64 : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
memref.assume_alignment %2, 64 : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3) | |
%4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
memref.assume_alignment %4, 64 : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg0 = %workgroup_id_z to %c10 step %workgroup_count_z { | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg1 = %6 to %c4096 step %7 { | |
%8 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%9 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg2 = %8 to %c4096 step %9 { | |
%subview = memref.subview %4[%arg0, %arg1, %arg2] [1, 64, 128] [1, 1, 1] : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_1 = memref.subview %0[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x64xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_2 = memref.subview %2[%arg0, 0, %arg2] [1, 64, 128] [1, 1, 1] : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.fill ins(%cst_0 : f16) outs(%subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]]>} ins(%subview_1, %subview_2 : memref<1x64x64xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1x64x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%alloc = memref.alloc() {alignment = 128 : i64} : memref<1x64x128xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%alloc : memref<1x64x128xf16, 3>) outs(%subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%10 = arith.mulf %out, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%11 = arith.mulf %in, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%12 = arith.addf %10, %11 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %12 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_3 = memref.subview %4[%arg0, %arg1, %arg2] [1, 64, 128] [1, 1, 1] : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
memref.copy %subview, %subview_3 : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before CSE (cse) //----- // | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
memref.assume_alignment %0, 64 : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
memref.assume_alignment %2, 64 : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3) | |
%4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
memref.assume_alignment %4, 64 : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg0 = %workgroup_id_z to %c10 step %workgroup_count_z { | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg1 = %6 to %c4096 step %7 { | |
%8 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%9 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg2 = %8 to %c4096 step %9 { | |
%subview = memref.subview %4[%arg0, %arg1, %arg2] [1, 64, 128] [1, 1, 1] : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_1 = memref.subview %0[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x64xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_2 = memref.subview %2[%arg0, 0, %arg2] [1, 64, 128] [1, 1, 1] : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.fill ins(%cst_0 : f16) outs(%subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]]>} ins(%subview_1, %subview_2 : memref<1x64x64xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1x64x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%alloc = memref.alloc() {alignment = 128 : i64} : memref<1x64x128xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%alloc : memref<1x64x128xf16, 3>) outs(%subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%10 = arith.mulf %out, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%11 = arith.mulf %in, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%12 = arith.addf %10, %11 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %12 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_3 = memref.subview %4[%arg0, %arg1, %arg2] [1, 64, 128] [1, 1, 1] : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
memref.copy %subview, %subview_3 : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
memref.assume_alignment %0, 64 : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
memref.assume_alignment %2, 64 : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3) | |
%4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
memref.assume_alignment %4, 64 : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg0 = %workgroup_id_z to %c10 step %workgroup_count_z { | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg1 = %6 to %c4096 step %7 { | |
%8 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%9 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg2 = %8 to %c4096 step %9 { | |
%subview = memref.subview %4[%arg0, %arg1, %arg2] [1, 64, 128] [1, 1, 1] : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_1 = memref.subview %0[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x64xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_2 = memref.subview %2[%arg0, 0, %arg2] [1, 64, 128] [1, 1, 1] : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.fill ins(%cst_0 : f16) outs(%subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]]>} ins(%subview_1, %subview_2 : memref<1x64x64xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1x64x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%alloc = memref.alloc() {alignment = 128 : i64} : memref<1x64x128xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%alloc : memref<1x64x128xf16, 3>) outs(%subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%10 = arith.mulf %out, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%11 = arith.mulf %in, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%12 = arith.addf %10, %11 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %12 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
memref.copy %subview, %subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before CleanupBufferAllocView (iree-codegen-cleanup-buffer-alloc-view) //----- // | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
memref.assume_alignment %0, 64 : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x4096x64xf16>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
memref.assume_alignment %2, 64 : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<10x64x4096xf16>> loc("/home/prashant/test.mlir":3:3) | |
%4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
memref.assume_alignment %4, 64 : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<10x4096x4096xf16>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg0 = %workgroup_id_z to %c10 step %workgroup_count_z { | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg1 = %6 to %c4096 step %7 { | |
%8 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%9 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg2 = %8 to %c4096 step %9 { | |
%subview = memref.subview %4[%arg0, %arg1, %arg2] [1, 64, 128] [1, 1, 1] : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_1 = memref.subview %0[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x64xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_2 = memref.subview %2[%arg0, 0, %arg2] [1, 64, 128] [1, 1, 1] : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.fill ins(%cst_0 : f16) outs(%subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]]>} ins(%subview_1, %subview_2 : memref<1x64x64xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1x64x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%alloc = memref.alloc() {alignment = 128 : i64} : memref<1x64x128xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%alloc : memref<1x64x128xf16, 3>) outs(%subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%10 = arith.mulf %out, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%11 = arith.mulf %in, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%12 = arith.addf %10, %11 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %12 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before SPIRVTileAndPromote (iree-spirv-tile-and-promote) //----- // | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
memref.assume_alignment %0, 64 : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
memref.assume_alignment %1, 64 : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
memref.assume_alignment %2, 64 : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg0 = %workgroup_id_z to %c10 step %workgroup_count_z { | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg1 = %3 to %c4096 step %4 { | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg2 = %5 to %c4096 step %6 { | |
%subview = memref.subview %2[%arg0, %arg1, %arg2] [1, 64, 128] [1, 1, 1] : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_1 = memref.subview %0[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x64xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_2 = memref.subview %1[%arg0, 0, %arg2] [1, 64, 128] [1, 1, 1] : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.fill ins(%cst_0 : f16) outs(%subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]]>} ins(%subview_1, %subview_2 : memref<1x64x64xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1x64x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%alloc = memref.alloc() {alignment = 128 : i64} : memref<1x64x128xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%alloc : memref<1x64x128xf16, 3>) outs(%subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%7 = arith.mulf %out, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%8 = arith.mulf %in, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%9 = arith.addf %7, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before RemoveSingleIterationLoop (iree-codegen-remove-single-iteration-loop) //----- // | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
%c64 = arith.constant 64 : index loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%c10 = arith.constant 10 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c4096 = arith.constant 4096 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%c32 = arith.constant 32 : index loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%c1 = arith.constant 1 : index loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%c128 = arith.constant 128 : index loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%alloc = memref.alloc() : memref<1x32x128xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%alloc_1 = memref.alloc() : memref<1x64x32xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%alloc_2 = memref.alloc() : memref<1x64x128xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
memref.assume_alignment %0, 64 : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
memref.assume_alignment %1, 64 : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
memref.assume_alignment %2, 64 : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg0 = %workgroup_id_z to %c10 step %workgroup_count_z { | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg1 = %3 to %c4096 step %4 { | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg2 = %5 to %c4096 step %6 { | |
%subview = memref.subview %2[%arg0, %arg1, %arg2] [1, 64, 128] [1, 1, 1] : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_3 = memref.subview %0[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x64xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_4 = memref.subview %1[%arg0, 0, %arg2] [1, 64, 128] [1, 1, 1] : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.fill {__internal_linalg_transform__ = "workgroup_memory"} ins(%cst_0 : f16) outs(%alloc_2 : memref<1x64x128xf16, 3>) loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg3 = %c0 to %c64 step %c32 { | |
%subview_6 = memref.subview %subview_3[0, 0, %arg3] [1, 64, 32] [1, 1, 1] : memref<1x64x64xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x32xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_7 = memref.subview %subview_4[0, %arg3, 0] [1, 32, 128] [1, 1, 1] : memref<1x64x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_8 = memref.subview %alloc_1[0, 0, 0] [%c1, %c64, %c32] [1, 1, 1] : memref<1x64x32xf16, 3> to memref<?x?x?xf16, strided<[2048, 32, 1]>, 3> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_9 = memref.subview %alloc[0, 0, 0] [%c1, %c32, %c128] [1, 1, 1] : memref<1x32x128xf16, 3> to memref<?x?x?xf16, strided<[4096, 128, 1]>, 3> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
gpu.barrier loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
memref.copy %subview_6, %subview_8 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x64x32xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?x?xf16, strided<[2048, 32, 1]>, 3> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
memref.copy %subview_7, %subview_9 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x32x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?x?xf16, strided<[4096, 128, 1]>, 3> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
gpu.barrier loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_memory", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]]>} ins(%subview_8, %subview_9 : memref<?x?x?xf16, strided<[2048, 32, 1]>, 3>, memref<?x?x?xf16, strided<[4096, 128, 1]>, 3>) outs(%alloc_2 : memref<1x64x128xf16, 3>) loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
gpu.barrier loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
memref.copy %alloc_2, %subview {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x64x128xf16, 3> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
gpu.barrier loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%alloc_5 = memref.alloc() {alignment = 128 : i64} : memref<1x64x128xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%alloc_5 : memref<1x64x128xf16, 3>) outs(%subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {__internal_linalg_transform__ = "workgroup_memory"} { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%7 = arith.mulf %out, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%8 = arith.mulf %in, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%9 = arith.addf %7, %8 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %9 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
%c64 = arith.constant 64 : index loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%c32 = arith.constant 32 : index loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%c1 = arith.constant 1 : index loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%c128 = arith.constant 128 : index loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%alloc = memref.alloc() : memref<1x32x128xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%alloc_1 = memref.alloc() : memref<1x64x32xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%alloc_2 = memref.alloc() : memref<1x64x128xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
memref.assume_alignment %0, 64 : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
memref.assume_alignment %1, 64 : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
memref.assume_alignment %2, 64 : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%subview = memref.subview %2[%workgroup_id_z, %3, %4] [1, 64, 128] [1, 1, 1] : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_3 = memref.subview %0[%workgroup_id_z, %3, 0] [1, 64, 64] [1, 1, 1] : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x64xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_4 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 64, 128] [1, 1, 1] : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.fill {__internal_linalg_transform__ = "workgroup_memory"} ins(%cst_0 : f16) outs(%alloc_2 : memref<1x64x128xf16, 3>) loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg0 = %c0 to %c64 step %c32 { | |
%subview_6 = memref.subview %subview_3[0, 0, %arg0] [1, 64, 32] [1, 1, 1] : memref<1x64x64xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x32xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_7 = memref.subview %subview_4[0, %arg0, 0] [1, 32, 128] [1, 1, 1] : memref<1x64x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_8 = memref.subview %alloc_1[0, 0, 0] [%c1, %c64, %c32] [1, 1, 1] : memref<1x64x32xf16, 3> to memref<?x?x?xf16, strided<[2048, 32, 1]>, 3> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_9 = memref.subview %alloc[0, 0, 0] [%c1, %c32, %c128] [1, 1, 1] : memref<1x32x128xf16, 3> to memref<?x?x?xf16, strided<[4096, 128, 1]>, 3> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
gpu.barrier loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
memref.copy %subview_6, %subview_8 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x64x32xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?x?xf16, strided<[2048, 32, 1]>, 3> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
memref.copy %subview_7, %subview_9 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x32x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?x?xf16, strided<[4096, 128, 1]>, 3> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
gpu.barrier loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_memory", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]]>} ins(%subview_8, %subview_9 : memref<?x?x?xf16, strided<[2048, 32, 1]>, 3>, memref<?x?x?xf16, strided<[4096, 128, 1]>, 3>) outs(%alloc_2 : memref<1x64x128xf16, 3>) loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
gpu.barrier loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
memref.copy %alloc_2, %subview {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x64x128xf16, 3> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
gpu.barrier loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%alloc_5 = memref.alloc() {alignment = 128 : i64} : memref<1x64x128xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%alloc_5 : memref<1x64x128xf16, 3>) outs(%subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {__internal_linalg_transform__ = "workgroup_memory"} { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%5 = arith.mulf %out, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%6 = arith.mulf %in, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%7 = arith.addf %5, %6 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %7 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before CSE (cse) //----- // | |
module { | |
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
%c64 = arith.constant 64 : index loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%c32 = arith.constant 32 : index loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%alloc = memref.alloc() : memref<1x32x128xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%alloc_1 = memref.alloc() : memref<1x64x32xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%alloc_2 = memref.alloc() : memref<1x64x128xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
memref.assume_alignment %0, 64 : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
memref.assume_alignment %1, 64 : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
memref.assume_alignment %2, 64 : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%subview = memref.subview %2[%workgroup_id_z, %3, %4] [1, 64, 128] [1, 1, 1] : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_3 = memref.subview %0[%workgroup_id_z, %3, 0] [1, 64, 64] [1, 1, 1] : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x64xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_4 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 64, 128] [1, 1, 1] : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.fill {__internal_linalg_transform__ = "workgroup_memory"} ins(%cst_0 : f16) outs(%alloc_2 : memref<1x64x128xf16, 3>) loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg0 = %c0 to %c64 step %c32 { | |
%subview_6 = memref.subview %subview_3[0, 0, %arg0] [1, 64, 32] [1, 1, 1] : memref<1x64x64xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x32xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_7 = memref.subview %subview_4[0, %arg0, 0] [1, 32, 128] [1, 1, 1] : memref<1x64x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
gpu.barrier loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
memref.copy %subview_6, %alloc_1 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x64x32xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x32xf16, 3> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
memref.copy %subview_7, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x32x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x128xf16, 3> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
gpu.barrier loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_memory", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]]>} ins(%alloc_1, %alloc : memref<1x64x32xf16, 3>, memref<1x32x128xf16, 3>) outs(%alloc_2 : memref<1x64x128xf16, 3>) loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
gpu.barrier loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
memref.copy %alloc_2, %subview {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x64x128xf16, 3> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
gpu.barrier loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%alloc_5 = memref.alloc() {alignment = 128 : i64} : memref<1x64x128xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%alloc_5 : memref<1x64x128xf16, 3>) outs(%subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {__internal_linalg_transform__ = "workgroup_memory"} { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%5 = arith.mulf %out, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%6 = arith.mulf %in, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%7 = arith.addf %5, %6 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %7 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before SPIRVTileToCooperativeOps (iree-spirv-tile-to-cooperative-ops) //----- // | |
// Dispatch function as dumped before the iree-spirv-tile-to-cooperative-ops pass.
// Per workgroup it: (1) zero-fills a 1x64x128 accumulator in memory space 3,
// (2) accumulates a batch_matmul over K = 64 in two steps of 32, staging operand
// slices through memory-space-3 buffers, (3) copies the accumulator to the
// matching 1x64x128 tile of the 10x4096x4096 output buffer, and (4) runs an
// elementwise linalg.generic in place on that output tile.
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
// Loop bounds and scalar constants. Note that in this dump %cst is 0.125 and %cst_0 is 0.0.
%c64 = arith.constant 64 : index loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
%c32 = arith.constant 32 : index loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
// Memory-space-3 staging buffers: %alloc receives the 1x32x128 slice of the second
// matmul operand, %alloc_1 the 1x64x32 slice of the first operand, and %alloc_2 is
// the 1x64x128 matmul accumulator (see the copies and batch_matmul in the loop below).
%alloc = memref.alloc() : memref<1x32x128xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%alloc_1 = memref.alloc() : memref<1x64x32xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%alloc_2 = memref.alloc() : memref<1x64x128xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// Storage-buffer bindings: %0 (10x4096x64) and %1 (10x64x4096) are the matmul
// operands; %2 (10x4096x4096) is the buffer the result tile is written to.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
memref.assume_alignment %0, 64 : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
memref.assume_alignment %1, 64 : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
memref.assume_alignment %2, 64 : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// Workgroup ids pick the tile: z indexes dim 0, y * 64 is the dim-1 offset (%3),
// and x * 128 is the dim-2 offset (%4).
%workgroup_id_x = hal.interface.workgroup.id[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// %subview: 1x64x128 output tile; %subview_3: 1x64x64 rows of %0 (full K);
// %subview_4: 1x64x128 columns of %1 (full K).
%subview = memref.subview %2[%workgroup_id_z, %3, %4] [1, 64, 128] [1, 1, 1] : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_3 = memref.subview %0[%workgroup_id_z, %3, 0] [1, 64, 64] [1, 1, 1] : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x64xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_4 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 64, 128] [1, 1, 1] : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
// Initialize the accumulator tile with %cst_0 (0.0).
linalg.fill {__internal_linalg_transform__ = "workgroup_memory"} ins(%cst_0 : f16) outs(%alloc_2 : memref<1x64x128xf16, 3>) loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
// Reduction loop: %arg0 takes the values 0 and 32 (0 to 64, step 32).
scf.for %arg0 = %c0 to %c64 step %c32 { | |
// Slice the current 32-wide K chunk out of both operand tiles.
%subview_6 = memref.subview %subview_3[0, 0, %arg0] [1, 64, 32] [1, 1, 1] : memref<1x64x64xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x32xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_7 = memref.subview %subview_4[0, %arg0, 0] [1, 32, 128] [1, 1, 1] : memref<1x64x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
// Barriers bracket the copies that refill the staging buffers, which are
// reused on every iteration.
gpu.barrier loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
memref.copy %subview_6, %alloc_1 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x64x32xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x32xf16, 3> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
memref.copy %subview_7, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x32x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x128xf16, 3> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
gpu.barrier loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
// Accumulate the 1x64x32 by 1x32x128 product into %alloc_2.
linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_memory", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]]>} ins(%alloc_1, %alloc : memref<1x64x32xf16, 3>, memref<1x32x128xf16, 3>) outs(%alloc_2 : memref<1x64x128xf16, 3>) loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
// Copy the finished accumulator to the output tile. The marker still reads
// "copy_to_workgroup_memory" although the destination is the storage buffer.
gpu.barrier loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
memref.copy %alloc_2, %subview {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x64x128xf16, 3> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
gpu.barrier loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
// NOTE(review): %alloc_5 is allocated here and used only as the generic's input;
// no op in this function writes to it, so %in below reads unwritten memory.
// Its contribution is multiplied by %cst_0 (0.0) in the body.
%alloc_5 = memref.alloc() {alignment = 128 : i64} : memref<1x64x128xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// Elementwise update in place on the output tile: %out is the value just copied
// from %alloc_2, and the yielded value is %out * %cst + %in * %cst_0.
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%alloc_5 : memref<1x64x128xf16, 3>) outs(%subview : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {__internal_linalg_transform__ = "workgroup_memory"} { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
%5 = arith.mulf %out, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%6 = arith.mulf %in, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%7 = arith.addf %5, %6 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %7 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// -----// IR Dump Before GPUMultiBuffering (iree-gpu-multi-buffering) //----- // | |
// Dispatch function as dumped before the iree-gpu-multi-buffering pass.
// Compared with the workgroup-level form, the fill, batch_matmul and elementwise
// generic are each wrapped in a three-deep scf.for nest whose lower bounds are
// derived from gpu.thread_id and whose steps equal the full extent, so every
// loop executes at most one iteration for a given thread id. Each nest body
// works on a 1x32x64 sub-tile and is tagged "vectorize".
func.func @forward_dispatch_0_batch_matmul_10x4096x4096x64() { | |
// Index constants used as loop bounds/steps; %cst is 0.125 and %cst_0 is 0.0.
%c0 = arith.constant 0 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c1 = arith.constant 1 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%c64 = arith.constant 64 : index loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%c32 = arith.constant 32 : index loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%c128 = arith.constant 128 : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%cst = arith.constant 1.250000e-01 : f16 loc(callsite("/home/prashant/test.mlir":5:14 at "/home/prashant/test.mlir":3:3)) | |
%cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("/home/prashant/test.mlir":4:12 at "/home/prashant/test.mlir":3:3)) | |
// Memory-space-3 buffers: %alloc / %alloc_1 stage the second / first matmul
// operand slices; %alloc_2 is the 1x64x128 accumulator.
%alloc = memref.alloc() : memref<1x32x128xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%alloc_1 = memref.alloc() : memref<1x64x32xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%alloc_2 = memref.alloc() : memref<1x64x128xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// Storage-buffer bindings: %0 and %1 are the matmul operands, %2 the result buffer.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
memref.assume_alignment %0, 64 : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
memref.assume_alignment %1, 64 : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> loc("/home/prashant/test.mlir":3:3) | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
memref.assume_alignment %2, 64 : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// Workgroup ids pick the tile: z indexes dim 0, %3 = y * 64, %4 = x * 128.
%workgroup_id_x = hal.interface.workgroup.id[0] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// %subview: 1x64x128 output tile; %subview_3 / %subview_4: operand tiles spanning all of K.
%subview = memref.subview %2[%workgroup_id_z, %3, %4] [1, 64, 128] [1, 1, 1] : memref<10x4096x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_3 = memref.subview %0[%workgroup_id_z, %3, 0] [1, 64, 64] [1, 1, 1] : memref<10x4096x64xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x64xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_4 = memref.subview %1[%workgroup_id_z, 0, %4] [1, 64, 128] [1, 1, 1] : memref<10x64x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<1x64x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
// Thread-distributed zero fill of the accumulator. Offsets: dim 0 starts at
// thread z, dim 1 at thread y * 32, dim 2 at (thread x floordiv 32) * 64.
%5 = gpu.thread_id x loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%6 = gpu.thread_id y loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
%7 = gpu.thread_id z loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg0 = %7 to %c1 step %c1 { | |
%11 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%6] loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg1 = %11 to %c64 step %c64 { | |
%12 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 64)>()[%5] loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg2 = %12 to %c128 step %c128 { | |
// Fill one 1x32x64 sub-tile of %alloc_2 with %cst_0 (0.0).
%subview_6 = memref.subview %alloc_2[%arg0, %arg1, %arg2] [1, 32, 64] [1, 1, 1] : memref<1x64x128xf16, 3> to memref<1x32x64xf16, strided<[8192, 128, 1], offset: ?>, 3> loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.fill {__internal_linalg_transform__ = "vectorize"} ins(%cst_0 : f16) outs(%subview_6 : memref<1x32x64xf16, strided<[8192, 128, 1], offset: ?>, 3>) loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":7:10 at "/home/prashant/test.mlir":3:3)) | |
// Reduction loop over K: %arg0 takes the values 0 and 32 (0 to 64, step 32).
scf.for %arg0 = %c0 to %c64 step %c32 { | |
// Slice the current 32-wide K chunk from both operand tiles and stage the
// slices into %alloc_1 / %alloc between barriers.
%subview_6 = memref.subview %subview_3[0, 0, %arg0] [1, 64, 32] [1, 1, 1] : memref<1x64x64xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x32xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_7 = memref.subview %subview_4[0, %arg0, 0] [1, 32, 128] [1, 1, 1] : memref<1x64x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
gpu.barrier loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
memref.copy %subview_6, %alloc_1 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x64x32xf16, strided<[262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x32xf16, 3> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
memref.copy %subview_7, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x32x128xf16, strided<[262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x128xf16, 3> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
gpu.barrier loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
// Thread-distributed matmul using the same thread-id-derived offsets as the fill.
%11 = gpu.thread_id x loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%12 = gpu.thread_id y loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%13 = gpu.thread_id z loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg1 = %13 to %c1 step %c1 { | |
%14 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%12] loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg2 = %14 to %c64 step %c64 { | |
%15 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 64)>(%11) loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg3 = %15 to %c128 step %c128 { | |
// 1x32x32 (rows %arg2 of %alloc_1) times 1x32x64 (columns %arg3 of %alloc),
// accumulated into the matching 1x32x64 sub-tile of %alloc_2.
%subview_8 = memref.subview %alloc_1[%arg1, %arg2, 0] [1, 32, 32] [1, 1, 1] : memref<1x64x32xf16, 3> to memref<1x32x32xf16, strided<[2048, 32, 1], offset: ?>, 3> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_9 = memref.subview %alloc[%arg1, 0, %arg3] [1, 32, 64] [1, 1, 1] : memref<1x32x128xf16, 3> to memref<1x32x64xf16, strided<[4096, 128, 1], offset: ?>, 3> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_10 = memref.subview %alloc_2[%arg1, %arg2, %arg3] [1, 32, 64] [1, 1, 1] : memref<1x64x128xf16, 3> to memref<1x32x64xf16, strided<[8192, 128, 1], offset: ?>, 3> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.batch_matmul {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]]>} ins(%subview_8, %subview_9 : memref<1x32x32xf16, strided<[2048, 32, 1], offset: ?>, 3>, memref<1x32x64xf16, strided<[4096, 128, 1], offset: ?>, 3>) outs(%subview_10 : memref<1x32x64xf16, strided<[8192, 128, 1], offset: ?>, 3>) loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
// Copy the finished accumulator to the output tile. This copy is not yet
// thread-distributed in this dump.
gpu.barrier loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
memref.copy %alloc_2, %subview {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x64x128xf16, 3> to memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
gpu.barrier loc(callsite("/home/prashant/test.mlir":8:10 at "/home/prashant/test.mlir":3:3)) | |
// NOTE(review): %alloc_5 is only ever read (through %subview_6 below); no op in
// this function writes to it, so the generic's %in reads unwritten memory.
%alloc_5 = memref.alloc() {alignment = 128 : i64} : memref<1x64x128xf16, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
// Thread-distributed elementwise update of the output tile, same offsets as above.
%8 = gpu.thread_id x loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%9 = gpu.thread_id y loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%10 = gpu.thread_id z loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg0 = %10 to %c1 step %c1 { | |
%11 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%9] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg1 = %11 to %c64 step %c64 { | |
%12 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 64)>()[%8] loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
scf.for %arg2 = %12 to %c128 step %c128 { | |
// %subview_6: 1x32x64 slice of %alloc_5 (input); %subview_7: matching slice
// of the output tile, read as %out and overwritten with the yielded value.
%subview_6 = memref.subview %alloc_5[%arg0, %arg1, %arg2] [1, 32, 64] [1, 1, 1] : memref<1x64x128xf16, 3> to memref<1x32x64xf16, strided<[8192, 128, 1], offset: ?>, 3> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
%subview_7 = memref.subview %subview[%arg0, %arg1, %arg2] [1, 32, 64] [1, 1, 1] : memref<1x64x128xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x64xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%subview_6 : memref<1x32x64xf16, strided<[8192, 128, 1], offset: ?>, 3>) outs(%subview_7 : memref<1x32x64xf16, strided<[16777216, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {__internal_linalg_transform__ = "vectorize"} { | |
^bb0(%in: f16 loc("/home/prashant/test.mlir":15:20), %out: f16 loc("/home/prashant/test.mlir":15:32)): | |
// Yields %out * %cst (0.125) + %in * %cst_0 (0.0).
%13 = arith.mulf %out, %cst : f16 loc(callsite("/home/prashant/test.mlir":11:12 at "/home/prashant/test.mlir":3:3)) | |
%14 = arith.mulf %in, %cst_0 : f16 loc(callsite("/home/prashant/test.mlir":16:12 at "/home/prashant/test.mlir":3:3)) | |
%15 = arith.addf %13, %14 : f16 loc(callsite("/home/prashant/test.mlir":17:12 at "/home/prashant/test.mlir":3:3)) | |
linalg.yield %15 : f16 loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
return loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
} loc(callsite("/home/prashant/test.mlir":14:10 at "/home/prashant/test.mlir":3:3)) | |
/home/prashant/test.mlir:14:10: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>]>>}> | |
%4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3, %0 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%0 : tensor<10x4096x4096xf16>) { | |
^ | |
/home/prashant/test.mlir:3:3: note: called from | |
func.func @forward(%arg0: tensor<10x4096x64xf16>, %arg1: tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> { | |
^ | |
/home/prashant/test.mlir:14:10: error: failed to serialize executables | |
%4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3, %0 : tensor<10x4096x4096xf16>, tensor<10x4096x4096xf16>) outs(%0 : tensor<10x4096x4096xf16>) { | |
^ | |
/home/prashant/test.mlir:3:3: note: called from | |
func.func @forward(%arg0: tensor<10x4096x64xf16>, %arg1: tensor<10x64x4096xf16>) -> tensor<10x4096x4096xf16> { | |
^ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment