Created
November 15, 2024 04:01
-
-
Save pashu123/06669abdb06599448b5173ff10c67a69 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- // | |
module { | |
func.func @matmul_2048x512x1024_f32_f32() -> tensor<2048x512xf32> { | |
%0 = util.unfoldable_constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%1 = util.unfoldable_constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst = arith.constant 0.000000e+00 : f32 | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
return %4 : tensor<2048x512xf32> | |
} | |
} | |
// -----// IR Dump After IREEImportPublicPass (iree-import-public) //----- // | |
module { | |
util.func public @matmul_2048x512x1024_f32_f32() -> tensor<2048x512xf32> { | |
%0 = util.unfoldable_constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%1 = util.unfoldable_constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst = arith.constant 0.000000e+00 : f32 | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
util.return %4 : tensor<2048x512xf32> | |
} | |
} | |
// -----// IR Dump After ImportMLProgramPass (iree-import-ml-program) //----- // | |
module { | |
util.func public @matmul_2048x512x1024_f32_f32() -> tensor<2048x512xf32> { | |
%0 = util.unfoldable_constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%1 = util.unfoldable_constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst = arith.constant 0.000000e+00 : f32 | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
util.return %4 : tensor<2048x512xf32> | |
} | |
} | |
// -----// IR Dump After SanitizeModuleNamesPass (iree-sanitize-module-names) //----- // | |
module { | |
util.func public @matmul_2048x512x1024_f32_f32() -> tensor<2048x512xf32> { | |
%0 = util.unfoldable_constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%1 = util.unfoldable_constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst = arith.constant 0.000000e+00 : f32 | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
util.return %4 : tensor<2048x512xf32> | |
} | |
} | |
// -----// IR Dump After ConvertMeshToFlowPass (iree-convert-mesh-to-flow) //----- // | |
module { | |
util.func public @matmul_2048x512x1024_f32_f32() -> tensor<2048x512xf32> { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = util.unfoldable_constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%1 = util.unfoldable_constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
util.return %4 : tensor<2048x512xf32> | |
} | |
} | |
// -----// IR Dump After DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- // | |
module { | |
util.func public @matmul_2048x512x1024_f32_f32() -> tensor<2048x512xf32> { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = util.unfoldable_constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%1 = util.unfoldable_constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
util.return %4 : tensor<2048x512xf32> | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- // | |
module { | |
util.func public @matmul_2048x512x1024_f32_f32() -> tensor<2048x512xf32> { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = util.unfoldable_constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%1 = util.unfoldable_constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
util.return %4 : tensor<2048x512xf32> | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- // | |
module { | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%0 = util.call @_matmul_2048x512x1024_f32_f32() : () -> tensor<2048x512xf32> | |
%1 = hal.tensor.export %0 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %1 : !hal.buffer_view | |
} | |
util.func private @_matmul_2048x512x1024_f32_f32() -> tensor<2048x512xf32> { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = util.unfoldable_constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%1 = util.unfoldable_constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
util.return %4 : tensor<2048x512xf32> | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_matmul_2048x512x1024_f32_f32() -> tensor<2048x512xf32> { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
util.return %4 : tensor<2048x512xf32> | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%0 = util.call @_matmul_2048x512x1024_f32_f32() : () -> tensor<2048x512xf32> | |
%1 = hal.tensor.export %0 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After Inliner (inline) //----- // | |
module { | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // | |
module { | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {hal.device.targets = [#device_target_cuda]} { | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After EraseUnusedLinalgOperandsPass (iree-global-opt-erase-unused-linalg-operands) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- // | |
// Whole-module dump. Every tensor type in the function is statically shaped,
// and the IR is identical to the module printed in the preceding dump.
// Aliases: CUDA executable target (sm_60, +ptx76) and the device target
// wrapping it.
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global referenced by the module's default stream affinity.
util.global private @__device_0 = #device_target_cuda | |
// Entry point: (2048x1024 of 1.0) x (1024x512 of 0.4) f32 matmul into a
// zero-filled 2048x512 accumulator, exported as buffer view "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- // | |
// Function-scope dump; the ops match the function in the preceding dump
// one-for-one. The function multiplies a 2048x1024 f32 tensor of 1.0s by a
// 1024x512 f32 tensor of 0.4s into a zero-filled 2048x512 accumulator and
// returns the result as the !hal.buffer_view "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
// Both matmul operands are routed through util.optimization_barrier.
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
// Zero-initialized accumulator for the matmul.
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
// Unchanged from the preceding dump: constant 2048x1024 (1.0) and 1024x512
// (0.4) f32 operands behind optimization barriers, a zero-filled 2048x512
// accumulator, one linalg.matmul, and an export as buffer view "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After DecomposeConcatPass (iree-global-opt-decompose-concat) //----- // | |
// The function contains no concat op, and the IR is unchanged from the
// preceding dump. It computes a 2048x1024 by 1024x512 f32 matmul on constant
// operands into a zero-filled accumulator and exports the 2048x512 result as
// buffer view "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // | |
// linalg.fill and linalg.matmul are still in named-op form in this dump (no
// linalg.generic appears); the IR is unchanged from the preceding dump.
// The function computes a constant-operand 2048x1024 by 1024x512 f32 matmul
// and exports the 2048x512 result as buffer view "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- // | |
// Whole-module dump. No tensor shape in the function has an extent of 1
// (dims are 2048, 1024 and 512), and the function is unchanged from the
// preceding dump.
// Aliases: CUDA executable target (sm_60, +ptx76) and its device target.
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global referenced by the module's default stream affinity.
util.global private @__device_0 = #device_target_cuda | |
// Entry point: constant-operand 2048x1024 by 1024x512 f32 matmul into a
// zero-filled accumulator, exported as buffer view "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- // | |
// The matmul operands and result are still f32 in this dump (no bf16 type
// appears), and the function is unchanged from the preceding dump.
// It multiplies constant 2048x1024 and 1024x512 tensors into a zero-filled
// 2048x512 accumulator and exports the result as buffer view "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
// Same ops, operands and types as the preceding dump. The function is a
// constant-operand 2048x1024 by 1024x512 f32 matmul whose 2048x512 result is
// exported as buffer view "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// No op in this function duplicates another (three distinct constants, two
// barriers on different operands), and the IR is unchanged from the preceding
// dump. The function computes the constant-operand f32 matmul and exports the
// 2048x512 result as buffer view "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
// Second canonicalizer dump in this stretch of the log; the function is again
// unchanged from the preceding dump. It fills a 2048x512 f32 accumulator with
// 0.0, runs linalg.matmul on the barrier-wrapped constant operands, and
// exports the result as buffer view "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Plain-tensor form of the function; unchanged from the preceding dump.
// It computes a 2048x1024 (all 1.0) by 1024x512 (all 0.4) f32 matmul into a
// zero-filled 2048x512 accumulator and exports the result as buffer view
// "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After SetEncodingPass (iree-dispatch-creation-set-encoding) //----- // | |
// Same matmul, now expressed on encoded tensor types. Both inputs go through
// iree_encoding.set_encoding (operand_index 0 and 1), the accumulator is
// created directly with the operand_index 2 encoding, and the result is
// converted back with iree_encoding.unset_encoding before export.
// Every encoding attribute carries op_type = matmul, element_types
// [f32, f32, f32], the three matmul indexing maps (d0,d2), (d2,d1), (d0,d1),
// and round_dims_to = [32, 32, 32].
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
// LHS (2048x1024) tagged as matmul operand 0.
%2 = iree_encoding.set_encoding %0 : tensor<2048x1024xf32> -> tensor<2048x1024xf32, #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>> | |
// RHS (1024x512) tagged as matmul operand 1.
%3 = iree_encoding.set_encoding %1 : tensor<1024x512xf32> -> tensor<1024x512xf32, #iree_encoding.encoding<operand_index = 1 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>> | |
// Accumulator (2048x512) is allocated and zero-filled in its encoded type
// (operand 2); no set_encoding op is used for it.
%4 = tensor.empty() : tensor<2048x512xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>> | |
%5 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<2048x512xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>) -> tensor<2048x512xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>> | |
// Matmul over the encoded operands, producing an encoded result.
%6 = linalg.matmul ins(%2, %3 : tensor<2048x1024xf32, #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>, tensor<1024x512xf32, #iree_encoding.encoding<operand_index = 1 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>) outs(%5 : tensor<2048x512xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>) -> tensor<2048x512xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>> | |
// Drop the encoding so the export sees a plain tensor<2048x512xf32>.
%7 = iree_encoding.unset_encoding %6 : tensor<2048x512xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>> -> tensor<2048x512xf32> | |
%8 = hal.tensor.export %7 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After MaterializeEncodingIntoNopPass (iree-codegen-materialize-encoding-into-nop) //----- // | |
// No iree_encoding ops or encoded tensor types remain in this dump: the
// set_encoding/unset_encoding ops of the preceding dump are gone and
// fill/matmul operate on plain tensor types again, matching the form the
// function had before encodings were introduced.
// The function computes the constant-operand 2048x1024 by 1024x512 f32 matmul
// and exports the 2048x512 result as buffer view "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Unchanged from the preceding dump. The function builds a zero-filled
// 2048x512 f32 accumulator, multiplies the barrier-wrapped constant operands
// (2048x1024 of 1.0, 1024x512 of 0.4) into it, and exports the result as
// buffer view "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After MaterializeHomogeneousEncodingsPass (iree-global-opt-materialize-homogeneous-encodings) //----- // | |
// Whole-module dump. No tensor type in the function carries an encoding, and
// the function is unchanged from the preceding dump.
// Aliases: CUDA executable target (sm_60, +ptx76, subgroup size 32) and the
// device target wrapping it.
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global referenced by the module's default stream affinity.
util.global private @__device_0 = #device_target_cuda | |
// Entry point: constant-operand 2048x1024 by 1024x512 f32 matmul into a
// zero-filled accumulator, exported as buffer view "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
// Whole-module dump; identical to the module printed in the preceding dump.
// Aliases: CUDA executable target (sm_60, +ptx76) and its device target.
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global referenced by the module's default stream affinity.
util.global private @__device_0 = #device_target_cuda | |
// Entry point: fills a 2048x512 f32 accumulator with 0.0, multiplies the
// barrier-wrapped constant operands into it, and exports the result as
// buffer view "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Whole-module dump; no op was removed or merged relative to the preceding
// dump.
// Aliases: CUDA executable target (sm_60, +ptx76) and its device target.
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global referenced by the module's default stream affinity.
util.global private @__device_0 = #device_target_cuda | |
// Entry point: constant-operand 2048x1024 by 1024x512 f32 matmul whose
// 2048x512 result is exported as buffer view "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After SimplifyPackUnpackPass (iree-global-opt-simplify-pack-unpack) //----- // | |
// Whole-module dump. The function contains no pack or unpack ops, and the
// module is identical to the one in the preceding dump.
// Aliases: CUDA executable target (sm_60, +ptx76) and its device target.
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global referenced by the module's default stream affinity.
util.global private @__device_0 = #device_target_cuda | |
// Entry point: constant-operand 2048x1024 by 1024x512 f32 matmul into a
// zero-filled accumulator, exported as buffer view "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After DataLayoutPropagationPass (iree-global-opt-data-layout-propagation) //----- // | |
// Function-scope dump; the ops match the function in the preceding dump
// one-for-one. The function computes the constant-operand 2048x1024 by
// 1024x512 f32 matmul and exports the 2048x512 result as buffer view
// "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // | |
// linalg.fill and linalg.matmul remain named ops in this dump (no
// linalg.generic appears); the function is unchanged from the preceding dump.
// It multiplies the barrier-wrapped constant operands into a zero-filled
// 2048x512 f32 accumulator and exports the result as buffer view "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- // | |
// The function body is straight-line code with no explicit loop ops, and it
// is unchanged from the preceding dump. It computes the constant-operand
// 2048x1024 by 1024x512 f32 matmul and exports the 2048x512 result as buffer
// view "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
// Same ops, operands and types as the preceding dump. The function fills a
// 2048x512 f32 tensor with 0.0, accumulates the product of a 2048x1024 tensor
// of 1.0s and a 1024x512 tensor of 0.4s into it, and exports the result as
// buffer view "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// No op was removed or merged relative to the preceding dump. The function
// computes the constant-operand 2048x1024 by 1024x512 f32 matmul into a
// zero-filled accumulator and exports the 2048x512 result as buffer view
// "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
// The function body contains no global load or store ops, and it is unchanged
// from the preceding dump. It computes the constant-operand 2048x1024 by
// 1024x512 f32 matmul and exports the 2048x512 result as buffer view
// "output0".
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After HoistIntoGlobals (iree-util-hoist-into-globals) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After JitGlobalsPass (iree-consteval-jit-globals) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = flow.dispatch.region -> (tensor<2048x512xf32>) { | |
%6 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.return %6 : tensor<2048x512xf32> | |
} | |
%5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = flow.dispatch.region -> (tensor<2048x512xf32>) { | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%6 = linalg.fill ins(%cst_2 : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.return %7 : tensor<2048x512xf32> | |
} | |
%4 = hal.tensor.export %3 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = flow.dispatch.region -> (tensor<2048x512xf32>) { | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%6 = linalg.fill ins(%cst_2 : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.return %7 : tensor<2048x512xf32> | |
} | |
%4 = hal.tensor.export %3 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = flow.dispatch.workgroups(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> = | |
(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%5 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%7 = tensor.empty() : tensor<2048x512xf32> | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%8 = linalg.fill ins(%cst_2 : f32) outs(%7 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%9 = linalg.matmul ins(%5, %6 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%8 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %9, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
flow.return | |
} | |
%4 = hal.tensor.export %3 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> = | |
(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%4 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%5 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%6 = tensor.empty() : tensor<2048x512xf32> | |
%7 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%8 = linalg.matmul ins(%4, %5 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%7 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
flow.return | |
} | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> = | |
(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%4 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%5 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%6 = tensor.empty() : tensor<2048x512xf32> | |
%7 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%8 = linalg.matmul ins(%4, %5 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%7 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
flow.return | |
} | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> = | |
(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%4 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%5 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%6 = tensor.empty() : tensor<2048x512xf32> | |
%7 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%8 = linalg.matmul ins(%4, %5 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%7 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After VerifyInputLegalityPass (iree-verify-input-legality) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> = | |
(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%4 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%5 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%6 = tensor.empty() : tensor<2048x512xf32> | |
%7 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%8 = linalg.matmul ins(%4, %5 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%7 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CaptureDynamicDimsPass (iree-flow-capture-dynamic-dims) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> = | |
(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%4 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%5 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%6 = tensor.empty() : tensor<2048x512xf32> | |
%7 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%8 = linalg.matmul ins(%4, %5 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%7 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> = | |
(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%4 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%5 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%6 = tensor.empty() : tensor<2048x512xf32> | |
%7 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%8 = linalg.matmul ins(%4, %5 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%7 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> = | |
(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%4 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%5 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%6 = tensor.empty() : tensor<2048x512xf32> | |
%7 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%8 = linalg.matmul ins(%4, %5 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%7 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After InitializeEmptyTensorsPass (iree-flow-initialize-empty-tensors) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> = | |
(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%4 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%5 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%6 = tensor.empty() : tensor<2048x512xf32> | |
%7 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%8 = linalg.matmul ins(%4, %5 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%7 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After OutlineDispatchExternsPass (iree-flow-outline-dispatch-externs) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> = | |
(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%4 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%5 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%6 = tensor.empty() : tensor<2048x512xf32> | |
%7 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%8 = linalg.matmul ins(%4, %5 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%7 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After OutlineDispatchRegionsPass (iree-flow-outline-dispatch-regions) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After AnnotateDispatchesPass (iree-flow-annotate-dispatches) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After StripDebugOps (iree-util-strip-debug-ops) //----- // | |
flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After DeduplicateExecutablesPass (iree-flow-deduplicate-executables) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After CleanupTensorShapesPass (iree-flow-cleanup-tensor-shapes) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After OutlineConstantsPass (iree-flow-outline-constants) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyInputPass (iree-stream-verify-input) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%2 = tensor.empty() : tensor<2048x512xf32> | |
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32> | |
%cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32> | |
%0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32> | |
%1 = util.optimization_barrier %cst : tensor<1024x512xf32> | |
%2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> | |
%3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view | |
util.return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ConvertToStreamPass (iree-stream-conversion) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%cst = arith.constant 0.000000e+00 : f32 | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1024x512xf32> in !stream.resource<constant> = dense<4.000000e-01> : tensor<1024x512xf32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
%1 = stream.async.transfer %cst : !stream.resource<constant>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
%cst_0 = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2048x1024xf32> in !stream.resource<constant> = dense<1.000000e+00> : tensor<2048x1024xf32> | |
%2 = stream.resource.size %cst_0 : !stream.resource<constant> | |
%3 = stream.async.transfer %cst_0 : !stream.resource<constant>{%2} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%2} | |
%4 = util.optimization_barrier %3 : !stream.resource<*> | |
%5 = util.optimization_barrier %1 : !stream.resource<*> | |
%c0 = arith.constant 0 : index | |
%6 = stream.resource.size %4 : !stream.resource<*> | |
%7 = stream.resource.size %5 : !stream.resource<*> | |
%8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index | |
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8} | |
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyLoweringToTensorsPass (iree-stream-verify-lowering-to-tensors) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%cst = arith.constant 0.000000e+00 : f32 | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1024x512xf32> in !stream.resource<constant> = dense<4.000000e-01> : tensor<1024x512xf32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
%1 = stream.async.transfer %cst : !stream.resource<constant>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
%cst_0 = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2048x1024xf32> in !stream.resource<constant> = dense<1.000000e+00> : tensor<2048x1024xf32> | |
%2 = stream.resource.size %cst_0 : !stream.resource<constant> | |
%3 = stream.async.transfer %cst_0 : !stream.resource<constant>{%2} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%2} | |
%4 = util.optimization_barrier %3 : !stream.resource<*> | |
%5 = util.optimization_barrier %1 : !stream.resource<*> | |
%c0 = arith.constant 0 : index | |
%6 = stream.resource.size %4 : !stream.resource<*> | |
%7 = stream.resource.size %5 : !stream.resource<*> | |
%8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index | |
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8} | |
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant 1.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 4.000000e-01 : f32 | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<1024x512xf32> : index | |
%1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst_0 : f32 -> tensor<1024x512xf32> in !stream.resource<*>{%0} | |
%2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x1024xf32> : index | |
%3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst : f32 -> tensor<2048x1024xf32> in !stream.resource<*>{%2} | |
%4 = util.optimization_barrier %3 : !stream.resource<*> | |
%5 = util.optimization_barrier %1 : !stream.resource<*> | |
%6 = stream.resource.size %4 : !stream.resource<*> | |
%7 = stream.resource.size %5 : !stream.resource<*> | |
%8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index | |
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8} | |
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After Inliner (inline) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant 1.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 4.000000e-01 : f32 | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<1024x512xf32> : index | |
%1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst_0 : f32 -> tensor<1024x512xf32> in !stream.resource<*>{%0} | |
%2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x1024xf32> : index | |
%3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst : f32 -> tensor<2048x1024xf32> in !stream.resource<*>{%2} | |
%4 = util.optimization_barrier %3 : !stream.resource<*> | |
%5 = util.optimization_barrier %1 : !stream.resource<*> | |
%6 = stream.resource.size %4 : !stream.resource<*> | |
%7 = stream.resource.size %5 : !stream.resource<*> | |
%8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index | |
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8} | |
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant 1.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 4.000000e-01 : f32 | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<1024x512xf32> : index | |
%1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst_0 : f32 -> tensor<1024x512xf32> in !stream.resource<*>{%0} | |
%2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x1024xf32> : index | |
%3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst : f32 -> tensor<2048x1024xf32> in !stream.resource<*>{%2} | |
%4 = util.optimization_barrier %3 : !stream.resource<*> | |
%5 = util.optimization_barrier %1 : !stream.resource<*> | |
%6 = stream.resource.size %4 : !stream.resource<*> | |
%7 = stream.resource.size %5 : !stream.resource<*> | |
%8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index | |
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8} | |
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant 1.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 4.000000e-01 : f32 | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<1024x512xf32> : index | |
%1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst_0 : f32 -> tensor<1024x512xf32> in !stream.resource<*>{%0} | |
%2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x1024xf32> : index | |
%3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst : f32 -> tensor<2048x1024xf32> in !stream.resource<*>{%2} | |
%4 = util.optimization_barrier %3 : !stream.resource<*> | |
%5 = util.optimization_barrier %1 : !stream.resource<*> | |
%6 = stream.resource.size %4 : !stream.resource<*> | |
%7 = stream.resource.size %5 : !stream.resource<*> | |
%8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index | |
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8} | |
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant 1.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 4.000000e-01 : f32 | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<1024x512xf32> : index | |
%1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst_0 : f32 -> tensor<1024x512xf32> in !stream.resource<*>{%0} | |
%2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x1024xf32> : index | |
%3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst : f32 -> tensor<2048x1024xf32> in !stream.resource<*>{%2} | |
%4 = util.optimization_barrier %3 : !stream.resource<*> | |
%5 = util.optimization_barrier %1 : !stream.resource<*> | |
%6 = stream.resource.size %4 : !stream.resource<*> | |
%7 = stream.resource.size %5 : !stream.resource<*> | |
%8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index | |
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8} | |
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant 1.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 4.000000e-01 : f32 | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<1024x512xf32> : index | |
%1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst_0 : f32 -> tensor<1024x512xf32> in !stream.resource<*>{%0} | |
%2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x1024xf32> : index | |
%3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst : f32 -> tensor<2048x1024xf32> in !stream.resource<*>{%2} | |
%4 = util.optimization_barrier %3 : !stream.resource<*> | |
%5 = util.optimization_barrier %1 : !stream.resource<*> | |
%6 = stream.resource.size %4 : !stream.resource<*> | |
%7 = stream.resource.size %5 : !stream.resource<*> | |
%8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index | |
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8} | |
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant 1.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 4.000000e-01 : f32 | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<1024x512xf32> : index | |
%1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst_0 : f32 -> tensor<1024x512xf32> in !stream.resource<*>{%0} | |
%2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x1024xf32> : index | |
%3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst : f32 -> tensor<2048x1024xf32> in !stream.resource<*>{%2} | |
%4 = util.optimization_barrier %3 : !stream.resource<*> | |
%5 = util.optimization_barrier %1 : !stream.resource<*> | |
%6 = stream.resource.size %4 : !stream.resource<*> | |
%7 = stream.resource.size %5 : !stream.resource<*> | |
%8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index | |
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8} | |
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant 1.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 4.000000e-01 : f32 | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<1024x512xf32> : index | |
%1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst_0 : f32 -> tensor<1024x512xf32> in !stream.resource<*>{%0} | |
%2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x1024xf32> : index | |
%3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst : f32 -> tensor<2048x1024xf32> in !stream.resource<*>{%2} | |
%4 = util.optimization_barrier %3 : !stream.resource<*> | |
%5 = util.optimization_barrier %1 : !stream.resource<*> | |
%6 = stream.resource.size %4 : !stream.resource<*> | |
%7 = stream.resource.size %5 : !stream.resource<*> | |
%8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index | |
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8} | |
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant 1.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 4.000000e-01 : f32 | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<1024x512xf32> : index | |
%1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst_0 : f32 -> tensor<1024x512xf32> in !stream.resource<*>{%0} | |
%2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x1024xf32> : index | |
%3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst : f32 -> tensor<2048x1024xf32> in !stream.resource<*>{%2} | |
%4 = util.optimization_barrier %3 : !stream.resource<*> | |
%5 = util.optimization_barrier %1 : !stream.resource<*> | |
%6 = stream.resource.size %4 : !stream.resource<*> | |
%7 = stream.resource.size %5 : !stream.resource<*> | |
%8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index | |
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8} | |
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant 1.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 4.000000e-01 : f32 | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<1024x512xf32> : index | |
%1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst_0 : f32 -> tensor<1024x512xf32> in !stream.resource<*>{%0} | |
%2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x1024xf32> : index | |
%3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst : f32 -> tensor<2048x1024xf32> in !stream.resource<*>{%2} | |
%4 = util.optimization_barrier %3 : !stream.resource<*> | |
%5 = util.optimization_barrier %1 : !stream.resource<*> | |
%6 = stream.resource.size %4 : !stream.resource<*> | |
%7 = stream.resource.size %5 : !stream.resource<*> | |
%8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index | |
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8} | |
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CombineInitializers (iree-util-combine-initializers) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%cst = arith.constant 1.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 4.000000e-01 : f32 | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<1024x512xf32> : index | |
%1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst_0 : f32 -> tensor<1024x512xf32> in !stream.resource<*>{%0} | |
%2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x1024xf32> : index | |
%3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst : f32 -> tensor<2048x1024xf32> in !stream.resource<*>{%2} | |
%4 = util.optimization_barrier %3 : !stream.resource<*> | |
%5 = util.optimization_barrier %1 : !stream.resource<*> | |
%6 = stream.resource.size %4 : !stream.resource<*> | |
%7 = stream.resource.size %5 : !stream.resource<*> | |
%8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index | |
%9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8} | |
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8} | |
%11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- // | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
// -----// IR Dump After EncodeHostTensorsPass (iree-stream-encode-host-tensors) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152} | |
%1 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<*> | |
%3 = util.optimization_barrier %0 : !stream.resource<*> | |
%4 = stream.resource.size %2 : !stream.resource<*> | |
%5 = stream.resource.size %3 : !stream.resource<*> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%2[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<*> | |
%4 = stream.resource.size %1 : !stream.resource<*> | |
%5 = stream.resource.size %3 : !stream.resource<*> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<*> | |
%4 = stream.resource.size %1 : !stream.resource<*> | |
%5 = stream.resource.size %3 : !stream.resource<*> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<*> | |
%4 = stream.resource.size %1 : !stream.resource<*> | |
%5 = stream.resource.size %3 : !stream.resource<*> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<*> | |
%4 = stream.resource.size %1 : !stream.resource<*> | |
%5 = stream.resource.size %3 : !stream.resource<*> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<*> | |
%4 = stream.resource.size %1 : !stream.resource<*> | |
%5 = stream.resource.size %3 : !stream.resource<*> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<*> | |
%4 = stream.resource.size %1 : !stream.resource<*> | |
%5 = stream.resource.size %3 : !stream.resource<*> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<*> | |
%4 = stream.resource.size %1 : !stream.resource<*> | |
%5 = stream.resource.size %3 : !stream.resource<*> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<*> | |
%4 = stream.resource.size %1 : !stream.resource<*> | |
%5 = stream.resource.size %3 : !stream.resource<*> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyLoweringToAsyncResourcesPass (iree-stream-verify-lowering-to-async-resources) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<*> | |
%4 = stream.resource.size %1 : !stream.resource<*> | |
%5 = stream.resource.size %3 : !stream.resource<*> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After MaterializeCopyOnWritePass (iree-stream-materialize-copy-on-write) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<*> | |
%4 = stream.resource.size %1 : !stream.resource<*> | |
%5 = stream.resource.size %3 : !stream.resource<*> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<*> | |
%4 = stream.resource.size %1 : !stream.resource<*> | |
%5 = stream.resource.size %3 : !stream.resource<*> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After ElideAsyncCopiesPass (iree-stream-elide-async-copies) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<*> | |
%4 = stream.resource.size %1 : !stream.resource<*> | |
%5 = stream.resource.size %3 : !stream.resource<*> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<*> | |
%4 = stream.resource.size %1 : !stream.resource<*> | |
%5 = stream.resource.size %3 : !stream.resource<*> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After EmplaceAllocationsPass (iree-stream-emplace-allocations) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<*> | |
%4 = stream.resource.size %1 : !stream.resource<*> | |
%5 = stream.resource.size %3 : !stream.resource<*> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After RefineUsagePass (iree-stream-refine-usage) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyAsyncAccessRangesPass (iree-stream-verify-async-access-ranges) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ScheduleExecutionPass (iree-stream-schedule-execution) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} { | |
%8 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
stream.yield %8 : !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} { | |
%8 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
stream.yield %8 : !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} { | |
%8 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
stream.yield %8 : !stream.resource<external>{%c4194304} | |
} => !stream.timepoint | |
%6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After ScheduleConcurrencyPass (iree-stream-schedule-concurrency) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} { | |
%8 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
stream.yield %8 : !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} { | |
%8 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
stream.yield %8 : !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} { | |
%8 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
stream.yield %8 : !stream.resource<external>{%c4194304} | |
} => !stream.timepoint | |
%6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After PropagateTimepointsPass (iree-stream-propagate-timepoints) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} { | |
%11 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
stream.yield %11 : !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} { | |
%11 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
stream.yield %11 : !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%6 = stream.timepoint.immediate => !stream.timepoint | |
%7 = stream.timepoint.immediate => !stream.timepoint | |
%8 = stream.timepoint.join max(%6, %7) => !stream.timepoint | |
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%8) => with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} { | |
%11 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
stream.yield %11 : !stream.resource<external>{%c4194304} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After MaterializeBuiltinsPass (iree-stream-materialize-builtins) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} { | |
%11 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
stream.yield %11 : !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} { | |
%11 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
stream.yield %11 : !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%6 = stream.timepoint.immediate => !stream.timepoint | |
%7 = stream.timepoint.immediate => !stream.timepoint | |
%8 = stream.timepoint.join max(%6, %7) => !stream.timepoint | |
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%8) => with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} { | |
%11 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
stream.yield %11 : !stream.resource<external>{%c4194304} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} { | |
%8 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
stream.yield %8 : !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} { | |
%8 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
stream.yield %8 : !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} { | |
%8 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
stream.yield %8 : !stream.resource<external>{%c4194304} | |
} => !stream.timepoint | |
%6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} { | |
%8 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
stream.yield %8 : !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} { | |
%8 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
stream.yield %8 : !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} { | |
%8 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
stream.yield %8 : !stream.resource<external>{%c4194304} | |
} => !stream.timepoint | |
%6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} { | |
%8 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
stream.yield %8 : !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} { | |
%8 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
stream.yield %8 : !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} { | |
%8 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
stream.yield %8 : !stream.resource<external>{%c4194304} | |
} => !stream.timepoint | |
%6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} { | |
%8 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
stream.yield %8 : !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} { | |
%8 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
stream.yield %8 : !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} { | |
%8 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
stream.yield %8 : !stream.resource<external>{%c4194304} | |
} => !stream.timepoint | |
%6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} { | |
%8 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
stream.yield %8 : !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} { | |
%8 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
stream.yield %8 : !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} { | |
%8 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
stream.yield %8 : !stream.resource<external>{%c4194304} | |
} => !stream.timepoint | |
%6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} { | |
%8 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
stream.yield %8 : !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} { | |
%8 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
stream.yield %8 : !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} { | |
%8 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
stream.yield %8 : !stream.resource<external>{%c4194304} | |
} => !stream.timepoint | |
%6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} { | |
%8 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
stream.yield %8 : !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} { | |
%8 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
stream.yield %8 : !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} { | |
%8 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
stream.yield %8 : !stream.resource<external>{%c4194304} | |
} => !stream.timepoint | |
%6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} { | |
%8 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
stream.yield %8 : !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} { | |
%8 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
stream.yield %8 : !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} { | |
%8 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
stream.yield %8 : !stream.resource<external>{%c4194304} | |
} => !stream.timepoint | |
%6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyLoweringToAsyncPass (iree-stream-verify-lowering-to-async) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} { | |
%8 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608} | |
stream.yield %8 : !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608} | |
%1 = util.optimization_barrier %0 : !stream.resource<transient> | |
%results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} { | |
%8 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152} | |
stream.yield %8 : !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152} | |
%3 = util.optimization_barrier %2 : !stream.resource<transient> | |
%4 = stream.resource.size %1 : !stream.resource<transient> | |
%5 = stream.resource.size %3 : !stream.resource<transient> | |
%results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} { | |
%8 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} | |
stream.yield %8 : !stream.resource<external>{%c4194304} | |
} => !stream.timepoint | |
%6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ScheduleAllocationPass (iree-stream-schedule-allocation) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%c0_0 = arith.constant 0 : index | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0_0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_1, %result_timepoint_2 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%c0_3 = arith.constant 0 : index | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_2) => with(%result_1 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0_3 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_1 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%c0_4 = arith.constant 0 : index | |
%result_5, %result_timepoint_6 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_6) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_5 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0_4 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_5 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After PackConstantsPass (iree-stream-pack-constants) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%c0_0 = arith.constant 0 : index | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0_0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_1, %result_timepoint_2 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%c0_3 = arith.constant 0 : index | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_2) => with(%result_1 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0_3 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_1 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%c0_4 = arith.constant 0 : index | |
%result_5, %result_timepoint_6 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_6) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_5 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0_4 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_5 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After LayoutSlicesPass (iree-stream-layout-slices) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%c0_0 = arith.constant 0 : index | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0_0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_1, %result_timepoint_2 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%c0_3 = arith.constant 0 : index | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_2) => with(%result_1 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0_3 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_1 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%c0_4 = arith.constant 0 : index | |
%result_5, %result_timepoint_6 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_6) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_5 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0_4 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_5 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After PropagateSubranges (iree-util-propagate-subranges) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%c0_0 = arith.constant 0 : index | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0_0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_1, %result_timepoint_2 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%c0_3 = arith.constant 0 : index | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_2) => with(%result_1 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0_3 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_1 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%c0_4 = arith.constant 0 : index | |
%result_5, %result_timepoint_6 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_6) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_5 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0_4 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_5 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
// Host entry point for the matmul_2048x512x1024 f32 test. Takes no arguments and
// returns the result as a !hal.buffer_view; the reflection attribute declares it
// as a sync func yielding tensor<2048x512xf32>.
// Flow: fill two transient operand buffers, dispatch the matmul executable into
// an external output buffer, await completion, then export the buffer.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Byte sizes and 32-bit fill patterns used below:
//   8388608 = 2048*1024*4, 2097152 = 1024*512*4, 4194304 = 2048*512*4
//   1065353216 = 0x3F800000 (bit pattern of f32 1.0)
//   1053609165 = 0x3ECCCCCD (bit pattern of f32 0.4)
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First operand: allocate 8388608 bytes, then fill the whole range with the
// 1.0 pattern once the allocation timepoint has been reached.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
// Wait for the fill, then pass the resource through an optimization barrier.
// NOTE(review): presumably the lowered form of the util.unfoldable_constant
// operands in the input program - confirm against the earlier dumps.
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second operand: same alloca/fill/await/barrier sequence with 2097152 bytes
// and the 0.4 pattern.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Sizes are queried from the barrier results (%2, %5) rather than reusing the
// constants; presumably the barrier hides the static size - TODO confirm.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Output: a 4194304-byte resource with `external` lifetime; this is the
// resource that gets exported at the end of the function.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Single dispatch of the matmul executable: %arg0 and %arg1 are bound
// read-only over their full size, %arg2 is bound write-only.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Wait for the dispatch, then export the output resource as a
// tensor<2048x512xf32> buffer view and return it.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
// Host entry point for the matmul_2048x512x1024 f32 test. Takes no arguments and
// returns the result as a !hal.buffer_view; the reflection attribute declares it
// as a sync func yielding tensor<2048x512xf32>.
// Flow: fill two transient operand buffers, dispatch the matmul executable into
// an external output buffer, await completion, then export the buffer.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Byte sizes and 32-bit fill patterns used below:
//   8388608 = 2048*1024*4, 2097152 = 1024*512*4, 4194304 = 2048*512*4
//   1065353216 = 0x3F800000 (bit pattern of f32 1.0)
//   1053609165 = 0x3ECCCCCD (bit pattern of f32 0.4)
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First operand: allocate 8388608 bytes, then fill the whole range with the
// 1.0 pattern once the allocation timepoint has been reached.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
// Wait for the fill, then pass the resource through an optimization barrier.
// NOTE(review): presumably the lowered form of the util.unfoldable_constant
// operands in the input program - confirm against the earlier dumps.
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second operand: same alloca/fill/await/barrier sequence with 2097152 bytes
// and the 0.4 pattern.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Sizes are queried from the barrier results (%2, %5) rather than reusing the
// constants; presumably the barrier hides the static size - TODO confirm.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Output: a 4194304-byte resource with `external` lifetime; this is the
// resource that gets exported at the end of the function.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Single dispatch of the matmul executable: %arg0 and %arg1 are bound
// read-only over their full size, %arg2 is bound write-only.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Wait for the dispatch, then export the output resource as a
// tensor<2048x512xf32> buffer view and return it.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
// Attribute aliases describing the compilation target: the "cuda" backend with
// the "cuda-nvptx-fb" executable format, GPU arch sm_60 with +ptx76, subgroup
// size 32, at most 1024 threads per workgroup and 49152 bytes of workgroup
// memory. No mma intrinsics are listed (mma = []).
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
// Whole-program module: one device global, one device executable containing
// the matmul kernel, and one public host function that drives it. Ops without
// an explicit affinity default to @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
// Device-side executable holding the single matmul dispatch.
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
// Export entry: the workgroup count (x, y, z) is produced by
// flow.dispatch.workgroup_count_from_slice and returned as three index values.
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Kernel body. %arg0 / %arg1 are the read-only 2048x1024 and 1024x512 f32
// operands, %arg2 is the write-only 2048x512 f32 result.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// View each binding at byte offset 0 as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both operands in full (zero offsets, unit strides).
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
// Zero-filled accumulator, then the matmul accumulating into it.
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
// Store the full 2048x512 result into the output binding.
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
// Host entry point: fills two transient operand buffers, dispatches the
// executable above into an external output buffer, awaits it and exports the
// result as a !hal.buffer_view (declared as tensor<2048x512xf32>).
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Byte sizes and 32-bit fill patterns used below:
//   8388608 = 2048*1024*4, 2097152 = 1024*512*4, 4194304 = 2048*512*4
//   1065353216 = 0x3F800000 (bit pattern of f32 1.0)
//   1053609165 = 0x3ECCCCCD (bit pattern of f32 0.4)
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First operand: allocate, fill with the 1.0 pattern, await, then barrier.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
// NOTE(review): the barrier is presumably what remains of the
// util.unfoldable_constant in the input program - confirm.
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second operand: same sequence with 2097152 bytes and the 0.4 pattern.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Sizes are queried from the barrier results instead of reusing the constants;
// presumably the barrier hides the static size - TODO confirm.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Output resource (external lifetime, 4194304 bytes); exported below.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Dispatch: two read-only operand bindings, one write-only result binding.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await the dispatch, export the output as a buffer view, and return it.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
// Attribute aliases describing the compilation target: the "cuda" backend with
// the "cuda-nvptx-fb" executable format, GPU arch sm_60 with +ptx76, subgroup
// size 32, at most 1024 threads per workgroup and 49152 bytes of workgroup
// memory. No mma intrinsics are listed (mma = []).
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
// Whole-program module: one device global, one device executable containing
// the matmul kernel, and one public host function that drives it. Ops without
// an explicit affinity default to @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
// Device-side executable holding the single matmul dispatch.
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
// Export entry: the workgroup count (x, y, z) is produced by
// flow.dispatch.workgroup_count_from_slice and returned as three index values.
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Kernel body. %arg0 / %arg1 are the read-only 2048x1024 and 1024x512 f32
// operands, %arg2 is the write-only 2048x512 f32 result.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// View each binding at byte offset 0 as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both operands in full (zero offsets, unit strides).
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
// Zero-filled accumulator, then the matmul accumulating into it.
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
// Store the full 2048x512 result into the output binding.
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
// Host entry point: fills two transient operand buffers, dispatches the
// executable above into an external output buffer, awaits it and exports the
// result as a !hal.buffer_view (declared as tensor<2048x512xf32>).
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Byte sizes and 32-bit fill patterns used below:
//   8388608 = 2048*1024*4, 2097152 = 1024*512*4, 4194304 = 2048*512*4
//   1065353216 = 0x3F800000 (bit pattern of f32 1.0)
//   1053609165 = 0x3ECCCCCD (bit pattern of f32 0.4)
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First operand: allocate, fill with the 1.0 pattern, await, then barrier.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
// NOTE(review): the barrier is presumably what remains of the
// util.unfoldable_constant in the input program - confirm.
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second operand: same sequence with 2097152 bytes and the 0.4 pattern.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Sizes are queried from the barrier results instead of reusing the constants;
// presumably the barrier hides the static size - TODO confirm.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Output resource (external lifetime, 4194304 bytes); exported below.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Dispatch: two read-only operand bindings, one write-only result binding.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await the dispatch, export the output as a buffer view, and return it.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
// Attribute aliases describing the compilation target: the "cuda" backend with
// the "cuda-nvptx-fb" executable format, GPU arch sm_60 with +ptx76, subgroup
// size 32, at most 1024 threads per workgroup and 49152 bytes of workgroup
// memory. No mma intrinsics are listed (mma = []).
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
// Whole-program module: one device global, one device executable containing
// the matmul kernel, and one public host function that drives it. Ops without
// an explicit affinity default to @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
// Device-side executable holding the single matmul dispatch.
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
// Export entry: the workgroup count (x, y, z) is produced by
// flow.dispatch.workgroup_count_from_slice and returned as three index values.
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Kernel body. %arg0 / %arg1 are the read-only 2048x1024 and 1024x512 f32
// operands, %arg2 is the write-only 2048x512 f32 result.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// View each binding at byte offset 0 as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both operands in full (zero offsets, unit strides).
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
// Zero-filled accumulator, then the matmul accumulating into it.
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
// Store the full 2048x512 result into the output binding.
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
// Host entry point: fills two transient operand buffers, dispatches the
// executable above into an external output buffer, awaits it and exports the
// result as a !hal.buffer_view (declared as tensor<2048x512xf32>).
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Byte sizes and 32-bit fill patterns used below:
//   8388608 = 2048*1024*4, 2097152 = 1024*512*4, 4194304 = 2048*512*4
//   1065353216 = 0x3F800000 (bit pattern of f32 1.0)
//   1053609165 = 0x3ECCCCCD (bit pattern of f32 0.4)
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First operand: allocate, fill with the 1.0 pattern, await, then barrier.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
// NOTE(review): the barrier is presumably what remains of the
// util.unfoldable_constant in the input program - confirm.
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second operand: same sequence with 2097152 bytes and the 0.4 pattern.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Sizes are queried from the barrier results instead of reusing the constants;
// presumably the barrier hides the static size - TODO confirm.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Output resource (external lifetime, 4194304 bytes); exported below.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Dispatch: two read-only operand bindings, one write-only result binding.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await the dispatch, export the output as a buffer view, and return it.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyLoweringToCmdPass (iree-stream-verify-lowering-to-cmd) //----- // | |
// Attribute aliases describing the compilation target: the "cuda" backend with
// the "cuda-nvptx-fb" executable format, GPU arch sm_60 with +ptx76, subgroup
// size 32, at most 1024 threads per workgroup and 49152 bytes of workgroup
// memory. No mma intrinsics are listed (mma = []).
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
// Whole-program module: one device global, one device executable containing
// the matmul kernel, and one public host function that drives it. Ops without
// an explicit affinity default to @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
// Device-side executable holding the single matmul dispatch.
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
// Export entry: the workgroup count (x, y, z) is produced by
// flow.dispatch.workgroup_count_from_slice and returned as three index values.
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Kernel body. %arg0 / %arg1 are the read-only 2048x1024 and 1024x512 f32
// operands, %arg2 is the write-only 2048x512 f32 result.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// View each binding at byte offset 0 as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both operands in full (zero offsets, unit strides).
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
// Zero-filled accumulator, then the matmul accumulating into it.
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
// Store the full 2048x512 result into the output binding.
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
// Host entry point: fills two transient operand buffers, dispatches the
// executable above into an external output buffer, awaits it and exports the
// result as a !hal.buffer_view (declared as tensor<2048x512xf32>).
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Byte sizes and 32-bit fill patterns used below:
//   8388608 = 2048*1024*4, 2097152 = 1024*512*4, 4194304 = 2048*512*4
//   1065353216 = 0x3F800000 (bit pattern of f32 1.0)
//   1053609165 = 0x3ECCCCCD (bit pattern of f32 0.4)
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First operand: allocate, fill with the 1.0 pattern, await, then barrier.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
// NOTE(review): the barrier is presumably what remains of the
// util.unfoldable_constant in the input program - confirm.
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second operand: same sequence with 2097152 bytes and the 0.4 pattern.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Sizes are queried from the barrier results instead of reusing the constants;
// presumably the barrier hides the static size - TODO confirm.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Output resource (external lifetime, 4194304 bytes); exported below.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Dispatch: two read-only operand bindings, one write-only result binding.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await the dispatch, export the output as a buffer view, and return it.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Host entry point for the matmul_2048x512x1024 f32 test. Takes no arguments and
// returns the result as a !hal.buffer_view; the reflection attribute declares it
// as a sync func yielding tensor<2048x512xf32>.
// Flow: fill two transient operand buffers, dispatch the matmul executable into
// an external output buffer, await completion, then export the buffer.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Byte sizes and 32-bit fill patterns used below:
//   8388608 = 2048*1024*4, 2097152 = 1024*512*4, 4194304 = 2048*512*4
//   1065353216 = 0x3F800000 (bit pattern of f32 1.0)
//   1053609165 = 0x3ECCCCCD (bit pattern of f32 0.4)
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First operand: allocate 8388608 bytes, then fill the whole range with the
// 1.0 pattern once the allocation timepoint has been reached.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
// Wait for the fill, then pass the resource through an optimization barrier.
// NOTE(review): presumably the lowered form of the util.unfoldable_constant
// operands in the input program - confirm against the earlier dumps.
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second operand: same alloca/fill/await/barrier sequence with 2097152 bytes
// and the 0.4 pattern.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Sizes are queried from the barrier results (%2, %5) rather than reusing the
// constants; presumably the barrier hides the static size - TODO confirm.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Output: a 4194304-byte resource with `external` lifetime; this is the
// resource that gets exported at the end of the function.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Single dispatch of the matmul executable: %arg0 and %arg1 are bound
// read-only over their full size, %arg2 is bound write-only.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Wait for the dispatch, then export the output resource as a
// tensor<2048x512xf32> buffer view and return it.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ElideTimepointsPass (iree-stream-elide-timepoints) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseDispatchBindingsPass (iree-stream-fuse-dispatch-bindings) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%arg3] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%arg4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%arg5] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%c0_4 = arith.constant 0 : index | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0, %c0, %c0 : index, index, index) { | |
ro %arg0[%c0_4 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0_4 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0_4 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After AnnotateDispatchArgumentsPass (iree-stream-annotate-dispatch-arguments) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: index {stream.values = [0 : index]}, %arg4: index {stream.values = [0 : index]}, %arg5: index {stream.values = [0 : index]}) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%arg3] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%arg4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%arg5] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%c0_4 = arith.constant 0 : index | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0, %c0, %c0 : index, index, index) { | |
ro %arg0[%c0_4 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0_4 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0_4 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After AnnotateDispatchAssumptionsPass (iree-stream-annotate-dispatch-assumptions) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: index {stream.values = [0 : index]}, %arg4: index {stream.values = [0 : index]}, %arg5: index {stream.values = [0 : index]}) { | |
%0:3 = util.assume.int | |
%arg3<umin = 0, umax = 0>, | |
%arg4<umin = 0, umax = 0>, | |
%arg5<umin = 0, umax = 0> | |
: index, index, index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%3 = stream.binding.subspan %arg2[%0#2] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%6 = tensor.empty() : tensor<2048x512xf32> | |
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%8 = linalg.matmul ins(%4, %5 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%7 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%c0_4 = arith.constant 0 : index | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0, %c0, %c0 : index, index, index) { | |
ro %arg0[%c0_4 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0_4 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0_4 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After PackDispatchOperandsPass (iree-stream-pack-dispatch-operands) //----- // | |
// Module snapshot after operand packing. The dispatch entry point below takes
// three bindings plus six i32 operands (%arg3..%arg8), and the host call site
// passes six i32 zeros; the call site in the preceding dump passed three
// `index` operands (%c0, %c0, %c0) instead.
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) { | |
// Rebuild one index value from two i32 operands: %arg3 supplies the low
// 32 bits and %arg4, shifted left by 32, supplies the high 32 bits.
%0 = arith.extui %arg3 : i32 to i64 | |
%1 = arith.extui %arg4 : i32 to i64 | |
%c32_i64 = arith.constant 32 : i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index | |
// Same low/high recombination for the (%arg5, %arg6) pair.
%5 = arith.extui %arg5 : i32 to i64 | |
%6 = arith.extui %arg6 : i32 to i64 | |
%c32_i64_0 = arith.constant 32 : i64 | |
%7 = arith.shli %6, %c32_i64_0 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index | |
// Same low/high recombination for the (%arg7, %arg8) pair.
%10 = arith.extui %arg7 : i32 to i64 | |
%11 = arith.extui %arg8 : i32 to i64 | |
%c32_i64_1 = arith.constant 32 : i64 | |
%12 = arith.shli %11, %c32_i64_1 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index | |
// Each recombined value is annotated with umin = umax = 0, i.e. the only
// value it is assumed to take is 0.
%15:3 = util.assume.int | |
%4<umin = 0, umax = 0>, | |
%9<umin = 0, umax = 0>, | |
%14<umin = 0, umax = 0> | |
: index, index, index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// The three assumed values index the subspans of the two read-only input
// bindings and the write-only output binding.
%16 = stream.binding.subspan %arg0[%15#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%17 = stream.binding.subspan %arg1[%15#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%18 = stream.binding.subspan %arg2[%15#2] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both operands in full, zero-fill the accumulator, run the matmul
// and store the whole 2048x512 result.
%19 = flow.dispatch.tensor.load %16, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%20 = flow.dispatch.tensor.load %17, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%21 = tensor.empty() : tensor<2048x512xf32> | |
%22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%23 = linalg.matmul ins(%19, %20 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%22 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %23, %18, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
// Host entry point: fills two transient resources with constant patterns,
// dispatches the matmul into an external resource and exports that resource
// as a !hal.buffer_view.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Sizes: 8388608 = 2048*1024*4, 2097152 = 1024*512*4, 4194304 = 2048*512*4.
// Fill patterns: 1065353216 = 0x3F800000 (f32 1.0) and
// 1053609165 = 0x3ECCCCCD (f32 nearest to 0.4).
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First input: allocate, fill once the allocation timepoint resolves, await
// the fill, then pass the resource through an optimization barrier.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second input: same allocate / fill / await / barrier sequence.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Sizes of the barrier results are queried from the resources instead of
// reusing the size constants above.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%c0_4 = arith.constant 0 : index | |
// Constants left by the operand packing. Only the six i32 zeros (%c0_i32,
// %c0_i32_6, %c0_i32_8, %c0_i32_11, %c0_i32_13, %c0_i32_16) are referenced by
// the dispatch below; the i64 constants have no uses in this function.
%c0_i64 = arith.constant 0 : i64 | |
%c0_i32 = arith.constant 0 : i32 | |
%c32_i64 = arith.constant 32 : i64 | |
%c0_i64_5 = arith.constant 0 : i64 | |
%c0_i32_6 = arith.constant 0 : i32 | |
%c0_i64_7 = arith.constant 0 : i64 | |
%c0_i32_8 = arith.constant 0 : i32 | |
%c32_i64_9 = arith.constant 32 : i64 | |
%c0_i64_10 = arith.constant 0 : i64 | |
%c0_i32_11 = arith.constant 0 : i32 | |
%c0_i64_12 = arith.constant 0 : i64 | |
%c0_i32_13 = arith.constant 0 : i32 | |
%c32_i64_14 = arith.constant 32 : i64 | |
%c0_i64_15 = arith.constant 0 : i64 | |
%c0_i32_16 = arith.constant 0 : i32 | |
// Dispatch the matmul: both inputs are bound read-only (ro) and the
// external result resource write-only (wo), each from offset %c0_4.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0_i32, %c0_i32_6, %c0_i32_8, %c0_i32_11, %c0_i32_13, %c0_i32_16 : i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0_4 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0_4 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0_4 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await the dispatch, then export the result resource as a buffer view.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Host entry point after canonicalization. Relative to the function in the
// preceding dump, the per-operand zero constants and the i64 constants are
// gone: one %c0_i32 feeds all six dispatch operands and %c0 is the offset for
// every binding.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Sizes: 4194304 = 2048*512*4, 8388608 = 2048*1024*4, 2097152 = 1024*512*4.
// Fill patterns: 1065353216 = 0x3F800000 (f32 1.0) and
// 1053609165 = 0x3ECCCCCD (f32 nearest to 0.4).
%c0_i32 = arith.constant 0 : i32 | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First input: allocate, fill after the allocation timepoint, await, barrier.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second input: same sequence with the other size and fill pattern.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Sizes are re-queried from the barrier results rather than taken from the
// constants used at allocation.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Matmul dispatch: two read-only inputs, one write-only external output.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await the dispatch and export the result as a buffer view.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Host entry point after CSE. The body reads the same as the canonicalized
// function in the preceding dump.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Sizes: 4194304 = 2048*512*4, 8388608 = 2048*1024*4, 2097152 = 1024*512*4.
// Fill patterns: 0x3F800000 (f32 1.0) and 0x3ECCCCCD (f32 nearest to 0.4).
%c0_i32 = arith.constant 0 : i32 | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// Produce the first filled input resource.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Produce the second filled input resource.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Query the sizes of the two barrier results for use in the dispatch.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Run the matmul dispatch once the output allocation timepoint resolves.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await the dispatch and return the exported buffer view.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- // | |
// Host entry point after integer-arithmetic optimization. The function
// contains only constants and stream/util ops, and its body reads the same as
// in the preceding dump.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Sizes: 4194304 = 2048*512*4, 8388608 = 2048*1024*4, 2097152 = 1024*512*4.
// Fill patterns: 0x3F800000 (f32 1.0) and 0x3ECCCCCD (f32 nearest to 0.4).
%c0_i32 = arith.constant 0 : i32 | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// Input 1: alloca -> fill -> await -> optimization barrier.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Input 2: alloca -> fill -> await -> optimization barrier.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Resource sizes are obtained via stream.resource.size on the barrier
// results and used as the binding lengths below.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Dispatch with six i32 zero operands; inputs ro, output wo.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await completion, export, return.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
// Host entry point after global-access simplification. The function contains
// no global load/store ops, and its body reads the same as in the preceding
// dump.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Sizes: 4194304 = 2048*512*4, 8388608 = 2048*1024*4, 2097152 = 1024*512*4.
// Fill patterns: 0x3F800000 (f32 1.0) and 0x3ECCCCCD (f32 nearest to 0.4).
%c0_i32 = arith.constant 0 : i32 | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// Allocate and fill the first transient input, then barrier it.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Allocate and fill the second transient input, then barrier it.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Sizes of the barrier results, used as binding lengths in the dispatch.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Matmul dispatch into the external result resource.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await the dispatch, export the result and return it.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
// Host entry point after the util pattern-application pass. The body reads
// the same as in the preceding dump.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Sizes: 4194304 = 2048*512*4, 8388608 = 2048*1024*4, 2097152 = 1024*512*4.
// Fill patterns: 0x3F800000 (f32 1.0) and 0x3ECCCCCD (f32 nearest to 0.4).
%c0_i32 = arith.constant 0 : i32 | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First input resource: allocated uninitialized, then filled and awaited.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second input resource: allocated uninitialized, then filled and awaited.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Sizes of the barrier results feed the dispatch binding ranges.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Matmul dispatch: bindings are (ro, ro, wo), all starting at offset %c0.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await the dispatch, export the external resource, return the view.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
// Full-module snapshot after global folding. The module still declares the
// single global @__device_0. In the dispatch function, %cst and one shared
// %c32_i64 now sit at the top of the body; the module dump after
// PackDispatchOperandsPass had three separate 32 : i64 constants.
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c32_i64 = arith.constant 32 : i64 | |
// Recombine (%arg3 low, %arg4 high) into one index value.
%0 = arith.extui %arg3 : i32 to i64 | |
%1 = arith.extui %arg4 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index | |
// Recombine (%arg5 low, %arg6 high) into one index value.
%5 = arith.extui %arg5 : i32 to i64 | |
%6 = arith.extui %arg6 : i32 to i64 | |
%7 = arith.shli %6, %c32_i64 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index | |
// Recombine (%arg7 low, %arg8 high) into one index value.
%10 = arith.extui %arg7 : i32 to i64 | |
%11 = arith.extui %arg8 : i32 to i64 | |
%12 = arith.shli %11, %c32_i64 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index | |
// Each recombined value carries the range annotation umin = umax = 0.
%15:3 = util.assume.int | |
%4<umin = 0, umax = 0>, | |
%9<umin = 0, umax = 0>, | |
%14<umin = 0, umax = 0> | |
: index, index, index | |
// Subspans of the two read-only inputs and the write-only output, indexed
// by the assumed values.
%16 = stream.binding.subspan %arg0[%15#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%17 = stream.binding.subspan %arg1[%15#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%18 = stream.binding.subspan %arg2[%15#2] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Full-tensor loads, zero-filled accumulator, matmul, full-tensor store.
%19 = flow.dispatch.tensor.load %16, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%20 = flow.dispatch.tensor.load %17, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%21 = tensor.empty() : tensor<2048x512xf32> | |
%22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%23 = linalg.matmul ins(%19, %20 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%22 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %23, %18, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
// Host entry point: fill two transient inputs, dispatch the matmul into an
// external resource, export that resource as a !hal.buffer_view.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Sizes: 4194304 = 2048*512*4, 8388608 = 2048*1024*4, 2097152 = 1024*512*4.
// Fill patterns: 0x3F800000 (f32 1.0) and 0x3ECCCCCD (f32 nearest to 0.4).
%c0_i32 = arith.constant 0 : i32 | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First input: alloca -> fill -> await -> optimization barrier.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second input: alloca -> fill -> await -> optimization barrier.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Sizes are queried from the barrier results and used as binding lengths.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Matmul dispatch: six i32 zero operands; bindings are (ro, ro, wo).
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await the dispatch, export the result resource, return the buffer view.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = arith.extui %arg3 : i32 to i64 | |
%1 = arith.extui %arg4 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index | |
%5 = arith.extui %arg5 : i32 to i64 | |
%6 = arith.extui %arg6 : i32 to i64 | |
%7 = arith.shli %6, %c32_i64 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index | |
%10 = arith.extui %arg7 : i32 to i64 | |
%11 = arith.extui %arg8 : i32 to i64 | |
%12 = arith.shli %11, %c32_i64 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index | |
%15:3 = util.assume.int | |
%4<umin = 0, umax = 0>, | |
%9<umin = 0, umax = 0>, | |
%14<umin = 0, umax = 0> | |
: index, index, index | |
%16 = stream.binding.subspan %arg0[%15#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%17 = stream.binding.subspan %arg1[%15#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%18 = stream.binding.subspan %arg2[%15#2] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%19 = flow.dispatch.tensor.load %16, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%20 = flow.dispatch.tensor.load %17, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%21 = tensor.empty() : tensor<2048x512xf32> | |
%22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%23 = linalg.matmul ins(%19, %20 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%22 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %23, %18, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = arith.extui %arg3 : i32 to i64 | |
%1 = arith.extui %arg4 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index | |
%5 = arith.extui %arg5 : i32 to i64 | |
%6 = arith.extui %arg6 : i32 to i64 | |
%7 = arith.shli %6, %c32_i64 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index | |
%10 = arith.extui %arg7 : i32 to i64 | |
%11 = arith.extui %arg8 : i32 to i64 | |
%12 = arith.shli %11, %c32_i64 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index | |
%15:3 = util.assume.int | |
%4<umin = 0, umax = 0>, | |
%9<umin = 0, umax = 0>, | |
%14<umin = 0, umax = 0> | |
: index, index, index | |
%16 = stream.binding.subspan %arg0[%15#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%17 = stream.binding.subspan %arg1[%15#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%18 = stream.binding.subspan %arg2[%15#2] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%19 = flow.dispatch.tensor.load %16, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%20 = flow.dispatch.tensor.load %17, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%21 = tensor.empty() : tensor<2048x512xf32> | |
%22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%23 = linalg.matmul ins(%19, %20 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%22 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %23, %18, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FoldUniformOperandsPass (iree-stream-fold-uniform-operands) //----- // | |
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_cuda | |
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
%c0_i32 = arith.constant 0 : i32 | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = arith.extui %c0_i32 : i32 to i64 | |
%1 = arith.extui %c0_i32 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index | |
%5 = arith.extui %c0_i32 : i32 to i64 | |
%6 = arith.extui %c0_i32 : i32 to i64 | |
%7 = arith.shli %6, %c32_i64 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index | |
%10 = arith.extui %c0_i32 : i32 to i64 | |
%11 = arith.extui %c0_i32 : i32 to i64 | |
%12 = arith.shli %11, %c32_i64 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index | |
%15:3 = util.assume.int | |
%4<umin = 0, umax = 0>, | |
%9<umin = 0, umax = 0>, | |
%14<umin = 0, umax = 0> | |
: index, index, index | |
%16 = stream.binding.subspan %arg0[%15#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%17 = stream.binding.subspan %arg1[%15#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%18 = stream.binding.subspan %arg2[%15#2] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%19 = flow.dispatch.tensor.load %16, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%20 = flow.dispatch.tensor.load %17, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%21 = tensor.empty() : tensor<2048x512xf32> | |
%22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%23 = linalg.matmul ins(%19, %20 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%22 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %23, %18, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
// Host-side entry point. Per its iree.abi.declaration string it is a synchronous
// function producing a tensor<2048x512xf32>; the value is returned as a !hal.buffer_view.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Size of the result resource; equals 2048*512*4, matching the tensor<2048x512xf32> exported below.
%c4194304 = arith.constant 4194304 : index | |
// i32 fill pattern for the first operand; 1065353216 == 0x3F800000, the f32 bit pattern of 1.0.
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
// Size of the first operand resource; equals 2048*1024*4.
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
// Size of the second operand resource; equals 1024*512*4.
%c2097152 = arith.constant 2097152 : index | |
// i32 fill pattern for the second operand; 1053609165 == 0x3ECCCCCD, the f32 bit pattern nearest 0.4.
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First operand: allocate an uninitialized transient resource, then fill all of it
// in an execution region that awaits the allocation timepoint.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
// Await the fill timepoint, then pass the resource through an optimization barrier.
// NOTE(review): the barrier is presumably what remains of util.unfoldable_constant in the input program - confirm.
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second operand: same allocate / fill / await / barrier sequence with its own size and pattern.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Sizes of the barrier results are queried dynamically rather than reusing the constants above.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Result buffer: an external resource, since it is exported to the caller below.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Run the matmul dispatch once the result allocation is ready: two read-only (ro)
// operand bindings and one write-only (wo) result binding, each covering its whole resource.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await the dispatch timepoint, export the result resource as a 2048x512xf32 buffer view, and return it.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
// Attribute aliases: a CUDA executable target (format "cuda-nvptx-fb", arch "sm_60",
// features "+ptx76") and the CUDA device target that lists it.
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
// Top-level module; its stream.affinity.default attribute is set to @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialised from the CUDA device target alias.
util.global private @__device_0 = #device_target_cuda | |
// Executable holding the single matmul dispatch referenced by the host function below.
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
// Export entry: the workgroups region takes no operands and returns the (x, y, z)
// triple produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: three bindings, each annotated with stream.alignment = 64.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
// Zero used to initialise the accumulator.
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// View each binding at offset 0 as a typed dispatch tensor: two read-only inputs, one write-only output.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both inputs in full (offsets 0, unit strides).
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
// Zero-filled 2048x512 accumulator, then the matmul of the two loaded operands into it.
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
// Store the whole result into the write-only binding.
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
// Host-side entry point. Per its iree.abi.declaration string it is a synchronous
// function producing a tensor<2048x512xf32>; the value is returned as a !hal.buffer_view.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Size of the result resource; equals 2048*512*4 for the 2048x512xf32 result.
%c4194304 = arith.constant 4194304 : index | |
// i32 fill pattern for the first operand; 1065353216 == 0x3F800000, the f32 bit pattern of 1.0.
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
// Size of the first operand resource; equals 2048*1024*4 for the 2048x1024xf32 input.
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
// Size of the second operand resource; equals 1024*512*4 for the 1024x512xf32 input.
%c2097152 = arith.constant 2097152 : index | |
// i32 fill pattern for the second operand; 1053609165 == 0x3ECCCCCD, the f32 bit pattern nearest 0.4.
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First operand: allocate an uninitialized transient resource, then fill all of it
// in an execution region that awaits the allocation timepoint.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
// Await the fill timepoint, then pass the resource through an optimization barrier.
// NOTE(review): the barrier is presumably what remains of util.unfoldable_constant in the input program - confirm.
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second operand: same allocate / fill / await / barrier sequence with its own size and pattern.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Sizes of the barrier results are queried dynamically rather than reusing the constants above.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Result buffer: an external resource, since it is exported to the caller below.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Run the matmul dispatch once the result allocation is ready: two read-only (ro)
// operand bindings and one write-only (wo) result binding, each covering its whole resource.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await the dispatch timepoint, export the result resource as a 2048x512xf32 buffer view, and return it.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
// Attribute aliases: a CUDA executable target (format "cuda-nvptx-fb", arch "sm_60",
// features "+ptx76") and the CUDA device target that lists it.
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
// Top-level module; its stream.affinity.default attribute is set to @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialised from the CUDA device target alias.
util.global private @__device_0 = #device_target_cuda | |
// Executable holding the single matmul dispatch referenced by the host function below.
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
// Export entry: the workgroups region takes no operands and returns the (x, y, z)
// triple produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: three bindings, each annotated with stream.alignment = 64.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
// Zero used to initialise the accumulator.
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// View each binding at offset 0 as a typed dispatch tensor: two read-only inputs, one write-only output.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both inputs in full (offsets 0, unit strides).
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
// Zero-filled 2048x512 accumulator, then the matmul of the two loaded operands into it.
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
// Store the whole result into the write-only binding.
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
// Host-side entry point. Per its iree.abi.declaration string it is a synchronous
// function producing a tensor<2048x512xf32>; the value is returned as a !hal.buffer_view.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Size of the result resource; equals 2048*512*4 for the 2048x512xf32 result.
%c4194304 = arith.constant 4194304 : index | |
// i32 fill pattern for the first operand; 1065353216 == 0x3F800000, the f32 bit pattern of 1.0.
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
// Size of the first operand resource; equals 2048*1024*4 for the 2048x1024xf32 input.
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
// Size of the second operand resource; equals 1024*512*4 for the 1024x512xf32 input.
%c2097152 = arith.constant 2097152 : index | |
// i32 fill pattern for the second operand; 1053609165 == 0x3ECCCCCD, the f32 bit pattern nearest 0.4.
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First operand: allocate an uninitialized transient resource, then fill all of it
// in an execution region that awaits the allocation timepoint.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
// Await the fill timepoint, then pass the resource through an optimization barrier.
// NOTE(review): the barrier is presumably what remains of util.unfoldable_constant in the input program - confirm.
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second operand: same allocate / fill / await / barrier sequence with its own size and pattern.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Sizes of the barrier results are queried dynamically rather than reusing the constants above.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Result buffer: an external resource, since it is exported to the caller below.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Run the matmul dispatch once the result allocation is ready: two read-only (ro)
// operand bindings and one write-only (wo) result binding, each covering its whole resource.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await the dispatch timepoint, export the result resource as a 2048x512xf32 buffer view, and return it.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
// Attribute aliases: a CUDA executable target (format "cuda-nvptx-fb", arch "sm_60",
// features "+ptx76") and the CUDA device target that lists it.
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
// Top-level module; its stream.affinity.default attribute is set to @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialised from the CUDA device target alias.
util.global private @__device_0 = #device_target_cuda | |
// Executable holding the single matmul dispatch referenced by the host function below.
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
// Export entry: the workgroups region takes no operands and returns the (x, y, z)
// triple produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: three bindings, each annotated with stream.alignment = 64.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
// Zero used to initialise the accumulator.
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// View each binding at offset 0 as a typed dispatch tensor: two read-only inputs, one write-only output.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both inputs in full (offsets 0, unit strides).
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
// Zero-filled 2048x512 accumulator, then the matmul of the two loaded operands into it.
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
// Store the whole result into the write-only binding.
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
// Host-side entry point. Per its iree.abi.declaration string it is a synchronous
// function producing a tensor<2048x512xf32>; the value is returned as a !hal.buffer_view.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Size of the result resource; equals 2048*512*4 for the 2048x512xf32 result.
%c4194304 = arith.constant 4194304 : index | |
// i32 fill pattern for the first operand; 1065353216 == 0x3F800000, the f32 bit pattern of 1.0.
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
// Size of the first operand resource; equals 2048*1024*4 for the 2048x1024xf32 input.
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
// Size of the second operand resource; equals 1024*512*4 for the 1024x512xf32 input.
%c2097152 = arith.constant 2097152 : index | |
// i32 fill pattern for the second operand; 1053609165 == 0x3ECCCCCD, the f32 bit pattern nearest 0.4.
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First operand: allocate an uninitialized transient resource, then fill all of it
// in an execution region that awaits the allocation timepoint.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
// Await the fill timepoint, then pass the resource through an optimization barrier.
// NOTE(review): the barrier is presumably what remains of util.unfoldable_constant in the input program - confirm.
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second operand: same allocate / fill / await / barrier sequence with its own size and pattern.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Sizes of the barrier results are queried dynamically rather than reusing the constants above.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Result buffer: an external resource, since it is exported to the caller below.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Run the matmul dispatch once the result allocation is ready: two read-only (ro)
// operand bindings and one write-only (wo) result binding, each covering its whole resource.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await the dispatch timepoint, export the result resource as a 2048x512xf32 buffer view, and return it.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // | |
// Attribute aliases: a CUDA executable target (format "cuda-nvptx-fb", arch "sm_60",
// features "+ptx76") and the CUDA device target that lists it.
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
// Top-level module; its stream.affinity.default attribute is set to @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialised from the CUDA device target alias.
util.global private @__device_0 = #device_target_cuda | |
// Executable holding the single matmul dispatch referenced by the host function below.
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
// Export entry: the workgroups region takes no operands and returns the (x, y, z)
// triple produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: three bindings, each annotated with stream.alignment = 64.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
// Zero used to initialise the accumulator.
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// View each binding at offset 0 as a typed dispatch tensor: two read-only inputs, one write-only output.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both inputs in full (offsets 0, unit strides).
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
// Zero-filled 2048x512 accumulator, then the matmul of the two loaded operands into it.
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
// Store the whole result into the write-only binding.
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
// Host-side entry point. Per its iree.abi.declaration string it is a synchronous
// function producing a tensor<2048x512xf32>; the value is returned as a !hal.buffer_view.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Size of the result resource; equals 2048*512*4 for the 2048x512xf32 result.
%c4194304 = arith.constant 4194304 : index | |
// i32 fill pattern for the first operand; 1065353216 == 0x3F800000, the f32 bit pattern of 1.0.
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
// Size of the first operand resource; equals 2048*1024*4 for the 2048x1024xf32 input.
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
// Size of the second operand resource; equals 1024*512*4 for the 1024x512xf32 input.
%c2097152 = arith.constant 2097152 : index | |
// i32 fill pattern for the second operand; 1053609165 == 0x3ECCCCCD, the f32 bit pattern nearest 0.4.
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First operand: allocate an uninitialized transient resource, then fill all of it
// in an execution region that awaits the allocation timepoint.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
// Await the fill timepoint, then pass the resource through an optimization barrier.
// NOTE(review): the barrier is presumably what remains of util.unfoldable_constant in the input program - confirm.
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second operand: same allocate / fill / await / barrier sequence with its own size and pattern.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Sizes of the barrier results are queried dynamically rather than reusing the constants above.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Result buffer: an external resource, since it is exported to the caller below.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Run the matmul dispatch once the result allocation is ready: two read-only (ro)
// operand bindings and one write-only (wo) result binding, each covering its whole resource.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await the dispatch timepoint, export the result resource as a 2048x512xf32 buffer view, and return it.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- // | |
// Attribute aliases: a CUDA executable target (format "cuda-nvptx-fb", arch "sm_60",
// features "+ptx76") and the CUDA device target that lists it.
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
// Top-level module; its stream.affinity.default attribute is set to @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialised from the CUDA device target alias.
util.global private @__device_0 = #device_target_cuda | |
// Executable holding the single matmul dispatch referenced by the host function below.
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
// Export entry: the workgroups region takes no operands and returns the (x, y, z)
// triple produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: three bindings, each annotated with stream.alignment = 64.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
// Zero used to initialise the accumulator.
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// View each binding at offset 0 as a typed dispatch tensor: two read-only inputs, one write-only output.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both inputs in full (offsets 0, unit strides).
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
// Zero-filled 2048x512 accumulator, then the matmul of the two loaded operands into it.
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
// Store the whole result into the write-only binding.
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
// Host-side entry point. Per its iree.abi.declaration string it is a synchronous
// function producing a tensor<2048x512xf32>; the value is returned as a !hal.buffer_view.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Size of the result resource; equals 2048*512*4 for the 2048x512xf32 result.
%c4194304 = arith.constant 4194304 : index | |
// i32 fill pattern for the first operand; 1065353216 == 0x3F800000, the f32 bit pattern of 1.0.
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
// Size of the first operand resource; equals 2048*1024*4 for the 2048x1024xf32 input.
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
// Size of the second operand resource; equals 1024*512*4 for the 1024x512xf32 input.
%c2097152 = arith.constant 2097152 : index | |
// i32 fill pattern for the second operand; 1053609165 == 0x3ECCCCCD, the f32 bit pattern nearest 0.4.
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First operand: allocate an uninitialized transient resource, then fill all of it
// in an execution region that awaits the allocation timepoint.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
// Await the fill timepoint, then pass the resource through an optimization barrier.
// NOTE(review): the barrier is presumably what remains of util.unfoldable_constant in the input program - confirm.
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second operand: same allocate / fill / await / barrier sequence with its own size and pattern.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Sizes of the barrier results are queried dynamically rather than reusing the constants above.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Result buffer: an external resource, since it is exported to the caller below.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Run the matmul dispatch once the result allocation is ready: two read-only (ro)
// operand bindings and one write-only (wo) result binding, each covering its whole resource.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await the dispatch timepoint, export the result resource as a 2048x512xf32 buffer view, and return it.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- // | |
// Attribute alias: HAL executable target "cuda" / "cuda-nvptx-fb" carrying the iree_gpu target description (arch "sm_60", subgroup size 32, up to 1024 threads and 49152 bytes of workgroup memory per workgroup).
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
// Attribute alias: a "cuda" HAL device target whose only executable target is the alias defined above.
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
// Top-level module; its stream.affinity.default attribute names the device global @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_cuda alias.
util.global private @__device_0 = #device_target_cuda | |
// Executable that holds the single matmul dispatch used by the entry point below.
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
// Export for the dispatch; its workgroups region returns the (x, y, z) values produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: three bindings, each annotated with stream.alignment = 64.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// View the bindings at offset %c0 as dispatch tensors: %arg0 and %arg1 are read-only inputs (2048x1024 and 1024x512), %arg2 is the write-only 2048x512 output.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both inputs in full (zero offsets, full sizes, unit strides).
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
// Zero-filled 2048x512 accumulator that linalg.matmul takes as its outs operand.
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
// Store the full matmul result into the write-only output binding.
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
// Public entry point (marked iree.abi.stub): takes no arguments and returns the result as a !hal.buffer_view; the reflection string declares it as a sync function producing tensor<2048x512xf32>.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Sizes used below, in bytes (f32 = 4 bytes): 8388608 = 2048*1024*4, 2097152 = 1024*512*4, 4194304 = 2048*512*4.
// i32 fill patterns: 1065353216 = 0x3F800000 (bit pattern of f32 1.0), 1053609165 = 0x3ECCCCCD (bit pattern of f32 0.4).
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First input: allocate 8388608 uninitialized transient bytes, then fill the whole range with the 1.0 pattern once the allocation timepoint is reached.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
// Wait for the fill, then pass the resource through util.optimization_barrier.
// NOTE(review): presumably the barrier keeps the constant fill from being folded into the dispatch - confirm against the util dialect docs.
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second input: same allocate / fill / await / barrier sequence with 2097152 bytes and the 0.4 pattern.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// The sizes of the barrier results are queried dynamically here instead of reusing the size constants.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Output: 4194304 uninitialized bytes allocated as an external resource; it is exported as the function result below.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Once the output allocation timepoint is reached, run the matmul dispatch: both inputs bound read-only (ro), the output write-only (wo), each over its full range starting at offset %c0.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Wait for the dispatch, export the output resource as a tensor<2048x512xf32> buffer view, and return it.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- // | |
// Attribute alias: HAL executable target "cuda" / "cuda-nvptx-fb" carrying the iree_gpu target description (arch "sm_60", subgroup size 32, up to 1024 threads and 49152 bytes of workgroup memory per workgroup).
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
// Attribute alias: a "cuda" HAL device target whose only executable target is the alias defined above.
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
// Top-level module; its stream.affinity.default attribute names the device global @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_cuda alias.
util.global private @__device_0 = #device_target_cuda | |
// Executable that holds the single matmul dispatch used by the entry point below.
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
// Export for the dispatch; its workgroups region returns the (x, y, z) values produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: three bindings, each annotated with stream.alignment = 64.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// View the bindings at offset %c0 as dispatch tensors: %arg0 and %arg1 are read-only inputs (2048x1024 and 1024x512), %arg2 is the write-only 2048x512 output.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both inputs in full (zero offsets, full sizes, unit strides).
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
// Zero-filled 2048x512 accumulator that linalg.matmul takes as its outs operand.
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
// Store the full matmul result into the write-only output binding.
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
// Public entry point (marked iree.abi.stub): takes no arguments and returns the result as a !hal.buffer_view; the reflection string declares it as a sync function producing tensor<2048x512xf32>.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Sizes used below, in bytes (f32 = 4 bytes): 8388608 = 2048*1024*4, 2097152 = 1024*512*4, 4194304 = 2048*512*4.
// i32 fill patterns: 1065353216 = 0x3F800000 (bit pattern of f32 1.0), 1053609165 = 0x3ECCCCCD (bit pattern of f32 0.4).
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First input: allocate 8388608 uninitialized transient bytes, then fill the whole range with the 1.0 pattern once the allocation timepoint is reached.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
// Wait for the fill, then pass the resource through util.optimization_barrier.
// NOTE(review): presumably the barrier keeps the constant fill from being folded into the dispatch - confirm against the util dialect docs.
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second input: same allocate / fill / await / barrier sequence with 2097152 bytes and the 0.4 pattern.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// The sizes of the barrier results are queried dynamically here instead of reusing the size constants.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Output: 4194304 uninitialized bytes allocated as an external resource; it is exported as the function result below.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Once the output allocation timepoint is reached, run the matmul dispatch: both inputs bound read-only (ro), the output write-only (wo), each over its full range starting at offset %c0.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Wait for the dispatch, export the output resource as a tensor<2048x512xf32> buffer view, and return it.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- // | |
// Attribute alias: HAL executable target "cuda" / "cuda-nvptx-fb" carrying the iree_gpu target description (arch "sm_60", subgroup size 32, up to 1024 threads and 49152 bytes of workgroup memory per workgroup).
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
// Attribute alias: a "cuda" HAL device target whose only executable target is the alias defined above.
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
// Top-level module; its stream.affinity.default attribute names the device global @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_cuda alias.
util.global private @__device_0 = #device_target_cuda | |
// Executable that holds the single matmul dispatch used by the entry point below.
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
// Export for the dispatch; its workgroups region returns the (x, y, z) values produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: three bindings, each annotated with stream.alignment = 64.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// View the bindings at offset %c0 as dispatch tensors: %arg0 and %arg1 are read-only inputs (2048x1024 and 1024x512), %arg2 is the write-only 2048x512 output.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both inputs in full (zero offsets, full sizes, unit strides).
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
// Zero-filled 2048x512 accumulator that linalg.matmul takes as its outs operand.
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
// Store the full matmul result into the write-only output binding.
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
// Public entry point (marked iree.abi.stub): takes no arguments and returns the result as a !hal.buffer_view; the reflection string declares it as a sync function producing tensor<2048x512xf32>.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Sizes used below, in bytes (f32 = 4 bytes): 8388608 = 2048*1024*4, 2097152 = 1024*512*4, 4194304 = 2048*512*4.
// i32 fill patterns: 1065353216 = 0x3F800000 (bit pattern of f32 1.0), 1053609165 = 0x3ECCCCCD (bit pattern of f32 0.4).
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First input: allocate 8388608 uninitialized transient bytes, then fill the whole range with the 1.0 pattern once the allocation timepoint is reached.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
// Wait for the fill, then pass the resource through util.optimization_barrier.
// NOTE(review): presumably the barrier keeps the constant fill from being folded into the dispatch - confirm against the util dialect docs.
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second input: same allocate / fill / await / barrier sequence with 2097152 bytes and the 0.4 pattern.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// The sizes of the barrier results are queried dynamically here instead of reusing the size constants.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Output: 4194304 uninitialized bytes allocated as an external resource; it is exported as the function result below.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Once the output allocation timepoint is reached, run the matmul dispatch: both inputs bound read-only (ro), the output write-only (wo), each over its full range starting at offset %c0.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Wait for the dispatch, export the output resource as a tensor<2048x512xf32> buffer view, and return it.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // | |
// Attribute alias: HAL executable target "cuda" / "cuda-nvptx-fb" carrying the iree_gpu target description (arch "sm_60", subgroup size 32, up to 1024 threads and 49152 bytes of workgroup memory per workgroup).
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
// Attribute alias: a "cuda" HAL device target whose only executable target is the alias defined above.
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
// Top-level module; its stream.affinity.default attribute names the device global @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_cuda alias.
util.global private @__device_0 = #device_target_cuda | |
// Executable that holds the single matmul dispatch used by the entry point below.
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
// Export for the dispatch; its workgroups region returns the (x, y, z) values produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: three bindings, each annotated with stream.alignment = 64.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// View the bindings at offset %c0 as dispatch tensors: %arg0 and %arg1 are read-only inputs (2048x1024 and 1024x512), %arg2 is the write-only 2048x512 output.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both inputs in full (zero offsets, full sizes, unit strides).
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
// Zero-filled 2048x512 accumulator that linalg.matmul takes as its outs operand.
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
// Store the full matmul result into the write-only output binding.
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
// Public entry point (marked iree.abi.stub): takes no arguments and returns the result as a !hal.buffer_view; the reflection string declares it as a sync function producing tensor<2048x512xf32>.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Sizes used below, in bytes (f32 = 4 bytes): 8388608 = 2048*1024*4, 2097152 = 1024*512*4, 4194304 = 2048*512*4.
// i32 fill patterns: 1065353216 = 0x3F800000 (bit pattern of f32 1.0), 1053609165 = 0x3ECCCCCD (bit pattern of f32 0.4).
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First input: allocate 8388608 uninitialized transient bytes, then fill the whole range with the 1.0 pattern once the allocation timepoint is reached.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
// Wait for the fill, then pass the resource through util.optimization_barrier.
// NOTE(review): presumably the barrier keeps the constant fill from being folded into the dispatch - confirm against the util dialect docs.
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second input: same allocate / fill / await / barrier sequence with 2097152 bytes and the 0.4 pattern.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// The sizes of the barrier results are queried dynamically here instead of reusing the size constants.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Output: 4194304 uninitialized bytes allocated as an external resource; it is exported as the function result below.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Once the output allocation timepoint is reached, run the matmul dispatch: both inputs bound read-only (ro), the output write-only (wo), each over its full range starting at offset %c0.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Wait for the dispatch, export the output resource as a tensor<2048x512xf32> buffer view, and return it.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Public entry point (marked iree.abi.stub): takes no arguments and returns the result as a !hal.buffer_view; the reflection string declares it as a sync function producing tensor<2048x512xf32>.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Sizes used below, in bytes (f32 = 4 bytes): 8388608 = 2048*1024*4, 2097152 = 1024*512*4, 4194304 = 2048*512*4.
// i32 fill patterns: 1065353216 = 0x3F800000 (bit pattern of f32 1.0), 1053609165 = 0x3ECCCCCD (bit pattern of f32 0.4).
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First input: allocate 8388608 uninitialized transient bytes, then fill the whole range with the 1.0 pattern once the allocation timepoint is reached.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
// Wait for the fill, then pass the resource through util.optimization_barrier.
// NOTE(review): presumably the barrier keeps the constant fill from being folded into the dispatch - confirm against the util dialect docs.
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second input: same allocate / fill / await / barrier sequence with 2097152 bytes and the 0.4 pattern.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// The sizes of the barrier results are queried dynamically here instead of reusing the size constants.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Output: 4194304 uninitialized bytes allocated as an external resource; it is exported as the function result below.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Once the output allocation timepoint is reached, run the matmul dispatch: both inputs bound read-only (ro), the output write-only (wo), each over its full range starting at offset %c0.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Wait for the dispatch, export the output resource as a tensor<2048x512xf32> buffer view, and return it.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Public entry point (marked iree.abi.stub): takes no arguments and returns the result as a !hal.buffer_view; the reflection string declares it as a sync function producing tensor<2048x512xf32>.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Sizes used below, in bytes (f32 = 4 bytes): 8388608 = 2048*1024*4, 2097152 = 1024*512*4, 4194304 = 2048*512*4.
// i32 fill patterns: 1065353216 = 0x3F800000 (bit pattern of f32 1.0), 1053609165 = 0x3ECCCCCD (bit pattern of f32 0.4).
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First input: allocate 8388608 uninitialized transient bytes, then fill the whole range with the 1.0 pattern once the allocation timepoint is reached.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
// Wait for the fill, then pass the resource through util.optimization_barrier.
// NOTE(review): presumably the barrier keeps the constant fill from being folded into the dispatch - confirm against the util dialect docs.
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second input: same allocate / fill / await / barrier sequence with 2097152 bytes and the 0.4 pattern.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// The sizes of the barrier results are queried dynamically here instead of reusing the size constants.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Output: 4194304 uninitialized bytes allocated as an external resource; it is exported as the function result below.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Once the output allocation timepoint is reached, run the matmul dispatch: both inputs bound read-only (ro), the output write-only (wo), each over its full range starting at offset %c0.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Wait for the dispatch, export the output resource as a tensor<2048x512xf32> buffer view, and return it.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
// Public entry point (marked iree.abi.stub): takes no arguments and returns the result as a !hal.buffer_view; the reflection string declares it as a sync function producing tensor<2048x512xf32>.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Sizes used below, in bytes (f32 = 4 bytes): 8388608 = 2048*1024*4, 2097152 = 1024*512*4, 4194304 = 2048*512*4.
// i32 fill patterns: 1065353216 = 0x3F800000 (bit pattern of f32 1.0), 1053609165 = 0x3ECCCCCD (bit pattern of f32 0.4).
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First input: allocate 8388608 uninitialized transient bytes, then fill the whole range with the 1.0 pattern once the allocation timepoint is reached.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
// Wait for the fill, then pass the resource through util.optimization_barrier.
// NOTE(review): presumably the barrier keeps the constant fill from being folded into the dispatch - confirm against the util dialect docs.
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second input: same allocate / fill / await / barrier sequence with 2097152 bytes and the 0.4 pattern.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// The sizes of the barrier results are queried dynamically here instead of reusing the size constants.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Output: 4194304 uninitialized bytes allocated as an external resource; it is exported as the function result below.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// Once the output allocation timepoint is reached, run the matmul dispatch: both inputs bound read-only (ro), the output write-only (wo), each over its full range starting at offset %c0.
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Wait for the dispatch, export the output resource as a tensor<2048x512xf32> buffer view, and return it.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
// Host-side entry point of the matmul program in stream-command form. It
// materializes two constant-filled device resources, runs the single matmul
// dispatch over them, and returns the result as a !hal.buffer_view.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Byte sizes and fill patterns used below:
//   4194304 = 2048*512*4  (the f32 result exported at the end of this function)
//   8388608 = 2048*1024*4 and 2097152 = 1024*512*4
//   1065353216 = 0x3F800000 (f32 1.0), 1053609165 = 0x3ECCCCCD (f32 ~0.4)
// NOTE(review): the two input sizes/patterns presumably correspond to the
// 2048x1024 (all 1.0) and 1024x512 (all 0.4) constants of the input program — confirm.
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First operand: allocate an uninitialized 8388608-byte transient resource, then
// fill its whole range with the 1.0 bit pattern once the allocation timepoint resolves.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
// Await the fill, then pass the resource through an optimization barrier.
// NOTE(review): the barrier presumably keeps the constant opaque to folding — confirm.
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second operand: same pattern with a 2097152-byte resource and the ~0.4 bit pattern.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Operand sizes are queried from the barrier results instead of reusing the constants.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Result buffer: a 4194304-byte resource with external lifetime.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// The single dispatch: reads both operands in full (ro) and writes the whole result (wo).
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await the dispatch, then export the resource as a tensor<2048x512xf32> buffer view.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
// Attribute aliases: the CUDA executable target (arch sm_60, +ptx76, subgroup
// size 32, 49152 bytes of workgroup memory) and the device target that wraps it.
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
// Whole program: one device global, one stream executable holding the matmul
// dispatch, and the public host function that invokes it.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global referenced by every #hal.device.affinity<@__device_0> below.
util.global private @__device_0 = #device_target_cuda | |
// Device executable containing the single matmul dispatch.
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
// Export with a workgroup-count region: the (x, y, z) counts are produced by
// flow.dispatch.workgroup_count_from_slice and returned unchanged.
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. The three bindings (stream.alignment = 64) are viewed at offset 0
// as readonly 2048x1024 and 1024x512 f32 tensors and a writeonly 2048x512 f32 tensor.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both operands in full, zero-fill an empty 2048x512 accumulator, run
// linalg.matmul into it, and store the whole result to the output binding.
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
// Host-side entry point: builds two constant-filled operand resources, runs the
// dispatch above, and returns the result as a !hal.buffer_view.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Byte sizes: 4194304 = 2048*512*4, 8388608 = 2048*1024*4, 2097152 = 1024*512*4
// (the f32 tensors declared in the dispatch body). The i32 fill values are f32 bit
// patterns: 1065353216 = 0x3F800000 (1.0) and 1053609165 = 0x3ECCCCCD (~0.4).
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First operand: allocate 8388608 transient bytes and fill the whole range with 1.0.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
// Await the fill, then pass the resource through an optimization barrier.
// NOTE(review): the barrier presumably keeps the constant opaque to folding — confirm.
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second operand: allocate 2097152 transient bytes and fill the whole range with ~0.4.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Operand sizes are queried from the barrier results instead of reusing the constants.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Result buffer: a 4194304-byte resource with external lifetime.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// The single dispatch, addressed as executable::export; reads both operands (ro)
// and writes the whole result (wo).
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await the dispatch, then export the resource as a tensor<2048x512xf32> buffer view.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
// Attribute aliases: the CUDA executable target (arch sm_60, +ptx76, subgroup
// size 32, 49152 bytes of workgroup memory) and the device target that wraps it.
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
// Whole program: one device global, one stream executable holding the matmul
// dispatch, and the public host function that invokes it.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global referenced by every #hal.device.affinity<@__device_0> below.
util.global private @__device_0 = #device_target_cuda | |
// Device executable containing the single matmul dispatch.
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
// Export with a workgroup-count region: the (x, y, z) counts are produced by
// flow.dispatch.workgroup_count_from_slice and returned unchanged.
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. The three bindings (stream.alignment = 64) are viewed at offset 0
// as readonly 2048x1024 and 1024x512 f32 tensors and a writeonly 2048x512 f32 tensor.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both operands in full, zero-fill an empty 2048x512 accumulator, run
// linalg.matmul into it, and store the whole result to the output binding.
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
// Host-side entry point: builds two constant-filled operand resources, runs the
// dispatch above, and returns the result as a !hal.buffer_view.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Byte sizes: 4194304 = 2048*512*4, 8388608 = 2048*1024*4, 2097152 = 1024*512*4
// (the f32 tensors declared in the dispatch body). The i32 fill values are f32 bit
// patterns: 1065353216 = 0x3F800000 (1.0) and 1053609165 = 0x3ECCCCCD (~0.4).
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First operand: allocate 8388608 transient bytes and fill the whole range with 1.0.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
// Await the fill, then pass the resource through an optimization barrier.
// NOTE(review): the barrier presumably keeps the constant opaque to folding — confirm.
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second operand: allocate 2097152 transient bytes and fill the whole range with ~0.4.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Operand sizes are queried from the barrier results instead of reusing the constants.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Result buffer: a 4194304-byte resource with external lifetime.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// The single dispatch, addressed as executable::export; reads both operands (ro)
// and writes the whole result (wo).
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await the dispatch, then export the resource as a tensor<2048x512xf32> buffer view.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // | |
// Attribute aliases: the CUDA executable target (arch sm_60, +ptx76, subgroup
// size 32, 49152 bytes of workgroup memory) and the device target that wraps it.
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
// Whole program: one device global, one stream executable holding the matmul
// dispatch, and the public host function that invokes it.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global referenced by every #hal.device.affinity<@__device_0> below.
util.global private @__device_0 = #device_target_cuda | |
// Device executable containing the single matmul dispatch.
stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
// Export with a workgroup-count region: the (x, y, z) counts are produced by
// flow.dispatch.workgroup_count_from_slice and returned unchanged.
stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. The three bindings (stream.alignment = 64) are viewed at offset 0
// as readonly 2048x1024 and 1024x512 f32 tensors and a writeonly 2048x512 f32 tensor.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both operands in full, zero-fill an empty 2048x512 accumulator, run
// linalg.matmul into it, and store the whole result to the output binding.
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
// Host-side entry point: builds two constant-filled operand resources, runs the
// dispatch above, and returns the result as a !hal.buffer_view.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Byte sizes: 4194304 = 2048*512*4, 8388608 = 2048*1024*4, 2097152 = 1024*512*4
// (the f32 tensors declared in the dispatch body). The i32 fill values are f32 bit
// patterns: 1065353216 = 0x3F800000 (1.0) and 1053609165 = 0x3ECCCCCD (~0.4).
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First operand: allocate 8388608 transient bytes and fill the whole range with 1.0.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
// Await the fill, then pass the resource through an optimization barrier.
// NOTE(review): the barrier presumably keeps the constant opaque to folding — confirm.
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second operand: allocate 2097152 transient bytes and fill the whole range with ~0.4.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Operand sizes are queried from the barrier results instead of reusing the constants.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Result buffer: a 4194304-byte resource with external lifetime.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// The single dispatch, addressed as executable::export; reads both operands (ro)
// and writes the whole result (wo).
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await the dispatch, then export the resource as a tensor<2048x512xf32> buffer view.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After MaterializeInterfacesPass (iree-hal-materialize-interfaces) //----- // | |
// Attribute aliases: the CUDA executable target (arch sm_60, +ptx76, subgroup
// size 32, 49152 bytes of workgroup memory), the pipeline layout with three
// storage_buffer bindings (the first two "ReadOnly|Indirect", the third Indirect),
// and the device target that wraps the executable target.
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
// Whole program: one device global, one HAL executable holding the matmul
// dispatch, and the public host function that invokes it.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global referenced by every #hal.device.affinity<@__device_0> below.
util.global private @__device_0 = #device_target_cuda | |
// Device executable in HAL form, with a single variant for the CUDA target alias.
hal.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { | |
// Export ordinal 0 using #pipeline_layout. Its region takes a !hal.device and
// returns the (x, y, z) counts from flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. It takes no arguments; buffers are reached through bindings 0-2
// of #pipeline_layout (alignment 64, offset 0): readonly 2048x1024 and 1024x512
// f32 tensors and a writeonly 2048x512 f32 tensor.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both operands in full, zero-fill an empty 2048x512 accumulator, run
// linalg.matmul into it, and store the whole result to the output binding.
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
} | |
// Host-side entry point: builds two constant-filled operand resources, runs the
// dispatch above, and returns the result as a !hal.buffer_view.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Byte sizes: 4194304 = 2048*512*4, 8388608 = 2048*1024*4, 2097152 = 1024*512*4
// (the f32 tensors declared in the dispatch body). The i32 fill values are f32 bit
// patterns: 1065353216 = 0x3F800000 (1.0) and 1053609165 = 0x3ECCCCCD (~0.4).
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First operand: allocate 8388608 transient bytes and fill the whole range with 1.0.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
// Await the fill, then pass the resource through an optimization barrier.
// NOTE(review): the barrier presumably keeps the constant opaque to folding — confirm.
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second operand: allocate 2097152 transient bytes and fill the whole range with ~0.4.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Operand sizes are queried from the barrier results instead of reusing the constants.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Result buffer: a 4194304-byte resource with external lifetime.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// The single dispatch, addressed as executable::variant::export; reads both
// operands (ro) and writes the whole result (wo).
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@cuda_nvptx_fb::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await the dispatch, then export the resource as a tensor<2048x512xf32> buffer view.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After PruneExecutablesPass (iree-hal-prune-executables) //----- // | |
// Attribute aliases: the CUDA executable target (arch sm_60, +ptx76, subgroup
// size 32, 49152 bytes of workgroup memory), the pipeline layout with three
// storage_buffer bindings (the first two "ReadOnly|Indirect", the third Indirect),
// and the device target that wraps the executable target.
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}> | |
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device | |
// Whole program: one device global, one HAL executable holding the matmul
// dispatch, and the public host function that invokes it.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global referenced by every #hal.device.affinity<@__device_0> below.
util.global private @__device_0 = #device_target_cuda | |
// Device executable in HAL form, with a single variant for the CUDA target alias.
hal.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { | |
// Export ordinal 0 using #pipeline_layout. Its region takes a !hal.device and
// returns the (x, y, z) counts from flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. It takes no arguments; buffers are reached through bindings 0-2
// of #pipeline_layout (alignment 64, offset 0): readonly 2048x1024 and 1024x512
// f32 tensors and a writeonly 2048x512 f32 tensor.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both operands in full, zero-fill an empty 2048x512 accumulator, run
// linalg.matmul into it, and store the whole result to the output binding.
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
} | |
// Host-side entry point: builds two constant-filled operand resources, runs the
// dispatch above, and returns the result as a !hal.buffer_view.
util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} { | |
// Byte sizes: 4194304 = 2048*512*4, 8388608 = 2048*1024*4, 2097152 = 1024*512*4
// (the f32 tensors declared in the dispatch body). The i32 fill values are f32 bit
// patterns: 1065353216 = 0x3F800000 (1.0) and 1053609165 = 0x3ECCCCCD (~0.4).
%c4194304 = arith.constant 4194304 : index | |
%c1065353216_i32 = arith.constant 1065353216 : i32 | |
%c8388608 = arith.constant 8388608 : index | |
%c0 = arith.constant 0 : index | |
%c2097152 = arith.constant 2097152 : index | |
%c1053609165_i32 = arith.constant 1053609165 : i32 | |
// First operand: allocate 8388608 transient bytes and fill the whole range with 1.0.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) { | |
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608} | |
} => !stream.timepoint | |
// Await the fill, then pass the resource through an optimization barrier.
// NOTE(review): the barrier presumably keeps the constant opaque to folding — confirm.
%1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608} | |
%2 = util.optimization_barrier %1 : !stream.resource<transient> | |
// Second operand: allocate 2097152 transient bytes and fill the whole range with ~0.4.
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) { | |
stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152} | |
%5 = util.optimization_barrier %4 : !stream.resource<transient> | |
// Operand sizes are queried from the barrier results instead of reusing the constants.
%6 = stream.resource.size %2 : !stream.resource<transient> | |
%7 = stream.resource.size %5 : !stream.resource<transient> | |
// Result buffer: a 4194304-byte resource with external lifetime.
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint | |
// The single dispatch, addressed as executable::variant::export; reads both
// operands (ro) and writes the whole result (wo).
%8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) { | |
stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@cuda_nvptx_fb::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 { | |
ro %arg0[%c0 for %6] : !stream.resource<transient>{%6}, | |
ro %arg1[%c0 for %7] : !stream.resource<transient>{%7}, | |
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304} | |
} | |
} => !stream.timepoint | |
// Await the dispatch, then export the resource as a tensor<2048x512xf32> buffer view.
%9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304} | |
%10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view | |
util.return %10 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After GPUGeneralizeNamedOpsPass (iree-codegen-gpu-generalize-named-ops) //----- // | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After TypePropagationPass (iree-codegen-type-propagation) //----- // | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- // | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- // | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- // | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After MaterializeEncodingIntoNopPass (iree-codegen-materialize-encoding-into-nop) //----- // | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- // | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After BlockDynamicDimensionsPass (iree-codegen-block-dynamic-dimensions) //----- // | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- // | |
module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
// -----// IR Dump After LLVMGPUSelectLoweringStrategyPass (iree-llvmgpu-select-lowering-strategy) //----- // | |
module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
// -----// IR Dump After ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- // | |
hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>) { | |
hal.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
%6 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
// -----// IR Dump After ConfigureExecutablesPass (iree-hal-configure-executables) //----- // | |
// Executable for dispatch 0 of the 2048x512x1024 f32 matmul, as dumped after
// ConfigureExecutablesPass. It holds one variant, one export, and one inner
// function that already carries its codegen configuration as attributes.
hal.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 { | |
// Single variant: target "cuda" / "cuda-nvptx-fb", arch sm_60, features +ptx76,
// subgroup_size_choices = [32], max_workgroup_memory_bytes = 49152.
hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>) { | |
// Export ordinal 0 with a three-binding pipeline layout: two ReadOnly storage
// buffers followed by one read/write storage buffer (all flagged Indirect).
hal.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) { | |
^bb0(%arg0: !hal.device): | |
// The export region returns the x/y/z counts produced by
// flow.dispatch.workgroup_count_from_slice; no concrete numbers appear yet.
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// translation_info on the function: pipeline LLVMGPUMatmulSimt,
// workgroup_size = [32, 8, 1], subgroup_size = 32, pipeline_depth = 0,
// store_stage = 1.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// Bindings 0 and 1 are the readonly 2048x1024 and 1024x512 f32 operands;
// binding 2 is the writeonly 2048x512 f32 result. All use offset %c0 and
// alignment 64.
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Both operands are loaded in full (offsets [0, 0], unit strides).
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
// Zero-fill the result, then run the whole matmul on it. Both ops are
// annotated with lowering_config tile_sizes = [[32, 128, 32]]; nothing is
// tiled at this point.
%6 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
} | |
} | |
// -----// IR Dump After LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- // | |
// Module-level view of the dispatch function after
// LowerExecutableUsingTransformDialectPass. The function text matches the one
// nested in the preceding ConfigureExecutablesPass dump: same translation_info,
// same untiled fill + matmul.
module { | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// Binding 0: readonly 2048x1024 f32, binding 1: readonly 1024x512 f32,
// binding 2: writeonly 2048x512 f32.
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
// Still one whole-tensor fill feeding one whole-tensor matmul; the
// tile_sizes = [[32, 128, 32]] annotation is present but not yet applied.
%6 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
} | |
// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- // | |
// Dispatch function after TileAndDistributeToWorkgroupsUsingForallOpPass.
// The whole-tensor fill + matmul is now wrapped in an scf.forall that walks the
// 2048x512 result in 32x128 tiles (64 x 4 iterations), mapped to workgroup y/x.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// The full operands are still loaded outside the loop; slicing happens inside.
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
// %arg0 steps rows by 32, %arg1 steps columns by 128; %arg2 is the shared
// 2048x512 output tensor.
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) { | |
// Per-iteration slices: 32 rows of the LHS with the full K extent (1024),
// 128 columns of the RHS with the full K extent, and the 32x128 output tile.
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32> | |
// Zero the tile, then accumulate the 32x1024 by 1024x128 product into it.
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x128xf32>) -> tensor<32x128xf32> | |
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<32x1024xf32>, tensor<1024x128xf32>) outs(%7 : tensor<32x128xf32>) -> tensor<32x128xf32> | |
// Each iteration writes its own [%arg0, %arg1] 32x128 window of %arg2.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32> | |
} | |
// mapping lists workgroup_mapping<y> for the first loop dimension and
// workgroup_mapping<x> for the second.
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// The assembled 2048x512 result of the forall is stored to binding 2.
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // | |
// Dispatch function after ConfigTrackingCanonicalizerPass. The printed IR shows
// no difference from the preceding TileAndDistributeToWorkgroupsUsingForallOpPass
// dump; the lowering_config attributes on fill and matmul are still attached.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// Three bindings: readonly LHS (2048x1024), readonly RHS (1024x512),
// writeonly result (2048x512), all f32.
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
// Parallel loop over 32x128 output tiles of the 2048x512 result.
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32> | |
// Tile-local zero fill followed by a matmul over the full K extent (1024).
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x128xf32>) -> tensor<32x128xf32> | |
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<32x1024xf32>, tensor<1024x128xf32>) outs(%7 : tensor<32x128xf32>) -> tensor<32x128xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Dispatch function after CSE. The printed IR matches the preceding
// ConfigTrackingCanonicalizerPass dump; no op was removed or merged.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// The three subspans repeat the same layout attribute and differ in binding
// index, flags and result type.
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
// Workgroup-level tiling: rows in steps of 32, columns in steps of 128.
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32> | |
// Fill the 32x128 tile with %cst (0.0), then matmul into it.
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x128xf32>) -> tensor<32x128xf32> | |
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<32x1024xf32>, tensor<1024x128xf32>) outs(%7 : tensor<32x128xf32>) -> tensor<32x128xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // | |
// Dispatch function after another ConfigTrackingCanonicalizerPass run. The
// printed IR matches the preceding CSE dump.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Whole-operand loads feed the tiled loop below.
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
// One iteration per 32x128 output tile, with %5 as the shared output.
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32> | |
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x128xf32>) -> tensor<32x128xf32> | |
// The matmul still consumes the full 1024-long reduction dimension at once.
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<32x1024xf32>, tensor<1024x128xf32>) outs(%7 : tensor<32x128xf32>) -> tensor<32x128xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // | |
// Dispatch function after a further ConfigTrackingCanonicalizerPass run (the
// log prints this pass twice in a row here). The printed IR matches the
// preceding dump.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
// Loop bounds (2048, 512) with steps (32, 128); result tensor is 2048x512.
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) { | |
// LHS rows [%arg0, %arg0+32), RHS columns [%arg1, %arg1+128), and the
// matching output window.
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32> | |
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x128xf32>) -> tensor<32x128xf32> | |
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<32x1024xf32>, tensor<1024x128xf32>) outs(%7 : tensor<32x128xf32>) -> tensor<32x128xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Dispatch function after the second CSE run. The printed IR matches the
// preceding ConfigTrackingCanonicalizerPass dump. This is the last dump in this
// stretch of the log in which the matmul covers the whole K = 1024 extent in a
// single op.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
// Parallel loop over output tiles, distributed via the workgroup y/x mapping
// attribute on the closing brace.
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32> | |
// %7 is the zero-initialised accumulator tile consumed by the matmul.
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x128xf32>) -> tensor<32x128xf32> | |
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<32x1024xf32>, tensor<1024x128xf32>) outs(%7 : tensor<32x128xf32>) -> tensor<32x128xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After GPUTensorTileToSerialLoopsPass (iree-codegen-gpu-tensor-tile-to-serial-loops) //----- // | |
// Dispatch function after GPUTensorTileToSerialLoopsPass. Compared with the
// preceding dump, the per-tile matmul is now wrapped in a serial scf.for that
// walks the reduction dimension (0 to 1024) in steps of 32, carrying the 32x128
// accumulator tile through iter_args. Two new index constants feed that loop.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
// %c32 is the scf.for step and %c1024 its upper bound.
%c32 = arith.constant 32 : index | |
%c1024 = arith.constant 1024 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
// Outer parallel loop over 32x128 output tiles is unchanged.
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32> | |
// The zero fill stays outside the K loop and seeds the accumulator (%arg4).
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x128xf32>) -> tensor<32x128xf32> | |
// Serial reduction loop: %arg3 is the K offset, 1024 / 32 = 32 iterations.
%8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) { | |
// K-slices of the already row/column-sliced operands: 32x32 from the LHS
// tile and 32x128 from the RHS tile, both at offset %arg3 along K.
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32> | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice_0[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32> | |
// The matmul now carries the extra attribute
// __internal_linalg_transform__ = "workgroup_k_tiled" and accumulates into
// the loop-carried tile %arg4.
%9 = linalg.matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%extracted_slice_2, %extracted_slice_3 : tensor<32x32xf32>, tensor<32x128xf32>) outs(%arg4 : tensor<32x128xf32>) -> tensor<32x128xf32> | |
scf.yield %9 : tensor<32x128xf32> | |
} | |
// The final accumulator %8 is inserted into this iteration's output window.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After GPUTensorAllocPass (iree-codegen-gpu-tensor-alloc) //----- // | |
// Snapshot of the matmul dispatch function as printed after the pass named in the
// separator line above. The function multiplies a 2048x1024 f32 tensor by a
// 1024x512 f32 tensor into a 2048x512 f32 result. Its translation_info attribute
// names the LLVMGPUMatmulSimt pipeline with workgroup_size = [32, 8, 1].
// In this snapshot the two matmul operand tiles are wrapped in
// bufferization.alloc_tensor copy(...) ops; in the dump before this one the matmul
// consumed the tensor.extract_slice results directly.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%c32 = arith.constant 32 : index | |
%c1024 = arith.constant 1024 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// Bindings 0 and 1 have readonly tensor types (the two inputs); binding 2 is writeonly (the result).
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both operands in full as tensors.
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
// Outer tiling: one iteration per 32x128 tile of the result; the mapping attribute
// at the end of this loop assigns the two dimensions to workgroup y and x.
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) { | |
// 32 rows of the LHS, 128 columns of the RHS, and the matching 32x128 result tile.
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32> | |
// Zero-initialise the result tile; it becomes the initial accumulator of the K loop.
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x128xf32>) -> tensor<32x128xf32> | |
// Reduction loop over the shared dimension (0 to 1024, step 32); %arg4 carries the accumulator.
%8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) { | |
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32> | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice_0[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32> | |
// Each per-step operand tile is copied into a newly allocated tensor.
// NOTE(review): presumably these become workgroup/shared-memory staging buffers
// after bufferization - no memory space is specified in this IR; confirm downstream.
%9 = bufferization.alloc_tensor() copy(%extracted_slice_2) : tensor<32x32xf32> | |
%10 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x128xf32> | |
// 32x32 by 32x128 partial product accumulated onto %arg4.
%11 = linalg.matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%9, %10 : tensor<32x32xf32>, tensor<32x128xf32>) outs(%arg4 : tensor<32x128xf32>) -> tensor<32x128xf32> | |
scf.yield %11 : tensor<32x128xf32> | |
} | |
// Write the finished 32x128 tile back into the shared 2048x512 result.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Store the complete result tensor to the writeonly binding.
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After GPUTensorTilePass (iree-codegen-gpu-tensor-tile) //----- // | |
// Snapshot of the matmul dispatch function as printed after the pass named in the
// separator line above. Compared with the dump before it, the linalg.fill and the
// linalg.matmul are each nested inside an scf.forall over (8, 32) iterations whose
// mapping attribute is [#gpu.thread<y>, #gpu.thread<x>]; every iteration works on a
// 4x4 tile of the 32x128 workgroup tile (8 * 4 = 32 rows, 32 * 4 = 128 columns).
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%c32 = arith.constant 32 : index | |
%c1024 = arith.constant 1024 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// Bindings 0 and 1 have readonly tensor types (the two inputs); binding 2 is writeonly (the result).
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
// Workgroup-level loop: one iteration per 32x128 tile of the result.
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32> | |
// Thread-level zero fill: each (%arg3, %arg4) iteration fills the 4x4 tile at
// offsets (%arg3 * 4, %arg4 * 4) of the 32x128 workgroup tile.
%7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_1) -> (tensor<32x128xf32>) { | |
%9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
%extracted_slice_2 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
// The lowering_config attribute still lists tile_sizes [[32, 128, 32]] although the op now acts on a 4x4 tensor.
%11 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_2 : tensor<4x4xf32>) -> tensor<4x4xf32> | |
// %12 / %13 apply the same map to the same operands as %9 / %10 (same offsets, recomputed for the insert).
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %11 into %arg5[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
// Reduction loop over the shared dimension (0 to 1024, step 32); %arg4 carries the 32x128 accumulator.
%8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) { | |
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32> | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice_0[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32> | |
// Per-step operand tiles copied into newly allocated tensors, outside the thread-level loop.
// NOTE(review): presumably shared-memory staging - the memory space is not visible in this IR.
%9 = bufferization.alloc_tensor() copy(%extracted_slice_2) : tensor<32x32xf32> | |
%10 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x128xf32> | |
// Thread-level matmul: each (%arg5, %arg6) iteration multiplies a 4x32 LHS strip
// by a 32x4 RHS strip and accumulates into its own 4x4 tile of the accumulator.
%11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) { | |
// %12 / %14 and %13 / %15 are pairwise the same computation (row and column offsets).
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
%14 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
%extracted_slice_4 = tensor.extract_slice %9[%12, 0] [4, 32] [1, 1] : tensor<32x32xf32> to tensor<4x32xf32> | |
%extracted_slice_5 = tensor.extract_slice %10[0, %13] [32, 4] [1, 1] : tensor<32x128xf32> to tensor<32x4xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg7[%14, %15] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
%16 = linalg.matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%extracted_slice_4, %extracted_slice_5 : tensor<4x32xf32>, tensor<32x4xf32>) outs(%extracted_slice_6 : tensor<4x4xf32>) -> tensor<4x4xf32> | |
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %16 into %arg7[%17, %18] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %11 : tensor<32x128xf32> | |
} | |
// Write the finished 32x128 tile back into the shared 2048x512 result.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After DecomposeConvolutionToLowerDimOpsPass (iree-codegen-decompose-convolution-to-lower-dim-ops) //----- // | |
// Snapshot of the matmul dispatch function as printed after the pass named in the
// separator line above. The function text here is the same as in the dump
// immediately before it (the function contains no convolution ops).
// Structure: a workgroup-level scf.forall over 32x128 result tiles, a thread-level
// scf.forall (8 x 32 iterations, 4x4 tile each) for the zero fill, then a K loop
// whose body copies the operand tiles and runs a thread-level 4x32 by 32x4 matmul.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%c32 = arith.constant 32 : index | |
%c1024 = arith.constant 1024 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// Two readonly input bindings (0, 1) and one writeonly output binding (2).
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
// Workgroup-level loop over 32x128 result tiles (mapped to workgroup y/x below).
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32> | |
// Thread-level zero fill of the workgroup tile, one 4x4 tile per iteration.
%7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_1) -> (tensor<32x128xf32>) { | |
%9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
%extracted_slice_2 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
%11 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_2 : tensor<4x4xf32>) -> tensor<4x4xf32> | |
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %11 into %arg5[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
// Reduction loop over the shared dimension in steps of 32; the filled tile %7 is the initial accumulator.
%8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) { | |
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32> | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice_0[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32> | |
// Operand tiles for this K step, copied into newly allocated tensors.
%9 = bufferization.alloc_tensor() copy(%extracted_slice_2) : tensor<32x32xf32> | |
%10 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x128xf32> | |
// Thread-level matmul: 4 LHS rows by 4 RHS columns per iteration, accumulated in place.
%11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) { | |
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
%14 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
%extracted_slice_4 = tensor.extract_slice %9[%12, 0] [4, 32] [1, 1] : tensor<32x32xf32> to tensor<4x32xf32> | |
%extracted_slice_5 = tensor.extract_slice %10[0, %13] [32, 4] [1, 1] : tensor<32x128xf32> to tensor<32x4xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg7[%14, %15] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
%16 = linalg.matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%extracted_slice_4, %extracted_slice_5 : tensor<4x32xf32>, tensor<32x4xf32>) outs(%extracted_slice_6 : tensor<4x4xf32>) -> tensor<4x4xf32> | |
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %16 into %arg7[%17, %18] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %11 : tensor<32x128xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Store the complete result tensor to the writeonly binding.
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After DecomposeIm2colPass (iree-linalg-ext-decompose-im2col) //----- // | |
// Snapshot of the matmul dispatch function as printed after the pass named in the
// separator line above. The function text here is the same as in the dump
// immediately before it (no im2col op appears in the function).
// Loop nest, outermost first: workgroup scf.forall (32x128 result tiles),
// thread scf.forall for the fill (4x4 tiles), scf.for over K (step 32),
// thread scf.forall for the matmul (4x4 tiles).
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%c32 = arith.constant 32 : index | |
%c1024 = arith.constant 1024 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// Two readonly input bindings (0, 1) and one writeonly output binding (2).
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
// Workgroup-level loop over 32x128 result tiles (mapped to workgroup y/x below).
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32> | |
// Thread-level zero fill of the workgroup tile, one 4x4 tile per iteration.
%7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_1) -> (tensor<32x128xf32>) { | |
%9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
%extracted_slice_2 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
%11 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_2 : tensor<4x4xf32>) -> tensor<4x4xf32> | |
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %11 into %arg5[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
// Reduction loop over the shared dimension in steps of 32; the filled tile %7 is the initial accumulator.
%8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) { | |
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32> | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice_0[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32> | |
// Operand tiles for this K step, copied into newly allocated tensors.
%9 = bufferization.alloc_tensor() copy(%extracted_slice_2) : tensor<32x32xf32> | |
%10 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x128xf32> | |
// Thread-level matmul: 4 LHS rows by 4 RHS columns per iteration, accumulated in place.
%11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) { | |
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
%14 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
%extracted_slice_4 = tensor.extract_slice %9[%12, 0] [4, 32] [1, 1] : tensor<32x32xf32> to tensor<4x32xf32> | |
%extracted_slice_5 = tensor.extract_slice %10[0, %13] [32, 4] [1, 1] : tensor<32x128xf32> to tensor<32x4xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg7[%14, %15] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
%16 = linalg.matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%extracted_slice_4, %extracted_slice_5 : tensor<4x32xf32>, tensor<32x4xf32>) outs(%extracted_slice_6 : tensor<4x4xf32>) -> tensor<4x4xf32> | |
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %16 into %arg7[%17, %18] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %11 : tensor<32x128xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Store the complete result tensor to the writeonly binding.
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After VectorizeIREEVectorExtOpsPass (iree-vector-ext-vectorize-ops) //----- // | |
// Snapshot of the matmul dispatch function as printed after the pass named in the
// separator line above. The function text here is the same as in the dump
// immediately before it; the fill and matmul are still linalg ops on tensors
// (the next dump in this log shows them in vector form).
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%c32 = arith.constant 32 : index | |
%c1024 = arith.constant 1024 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// Two readonly input bindings (0, 1) and one writeonly output binding (2).
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
// Workgroup-level loop over 32x128 result tiles (mapped to workgroup y/x below).
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32> | |
// Thread-level zero fill of the workgroup tile, one 4x4 tile per iteration.
%7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_1) -> (tensor<32x128xf32>) { | |
%9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
%extracted_slice_2 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
%11 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_2 : tensor<4x4xf32>) -> tensor<4x4xf32> | |
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %11 into %arg5[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
// Reduction loop over the shared dimension in steps of 32; the filled tile %7 is the initial accumulator.
%8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) { | |
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32> | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice_0[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32> | |
// Operand tiles for this K step, copied into newly allocated tensors.
%9 = bufferization.alloc_tensor() copy(%extracted_slice_2) : tensor<32x32xf32> | |
%10 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x128xf32> | |
// Thread-level matmul: 4 LHS rows by 4 RHS columns per iteration, accumulated in place.
%11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) { | |
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
%14 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
%extracted_slice_4 = tensor.extract_slice %9[%12, 0] [4, 32] [1, 1] : tensor<32x32xf32> to tensor<4x32xf32> | |
%extracted_slice_5 = tensor.extract_slice %10[0, %13] [32, 4] [1, 1] : tensor<32x128xf32> to tensor<32x4xf32> | |
%extracted_slice_6 = tensor.extract_slice %arg7[%14, %15] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
%16 = linalg.matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%extracted_slice_4, %extracted_slice_5 : tensor<4x32xf32>, tensor<32x4xf32>) outs(%extracted_slice_6 : tensor<4x4xf32>) -> tensor<4x4xf32> | |
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %16 into %arg7[%17, %18] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %11 : tensor<32x128xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Store the complete result tensor to the writeonly binding.
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- // | |
// Snapshot of the matmul dispatch function as printed after the pass named in the
// separator line above. Compared with the dump before it:
//   - the per-thread linalg.fill is now a vector.transfer_write of a constant
//     all-zero vector<4x4xf32> (%cst);
//   - the per-thread linalg.matmul is now three vector.transfer_read ops, one
//     vector.contract, and a vector.transfer_write of the 4x4 result;
//   - the scalar zero is now named %cst_0 and is used as the transfer_read padding value;
//   - the extract_slice result names are renumbered (e.g. %extracted_slice_0 -> %extracted_slice_1).
// The surrounding loop nest and tile sizes are the same.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
// All-zero 4x4 vector written by each thread-level fill iteration.
%cst = arith.constant dense<0.000000e+00> : vector<4x4xf32> | |
%c32 = arith.constant 32 : index | |
%c1024 = arith.constant 1024 : index | |
// Scalar zero passed as the padding operand of the vector.transfer_read ops below.
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// Two readonly input bindings (0, 1) and one writeonly output binding (2).
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
// Workgroup-level loop over 32x128 result tiles (mapped to workgroup y/x below).
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32> | |
%extracted_slice_1 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32> | |
// Thread-level zero fill, one 4x4 tile per iteration, now expressed as a vector write.
%7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_2) -> (tensor<32x128xf32>) { | |
%9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
%extracted_slice_3 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
// Writes the whole 4x4 zero vector at offset [0, 0]; both dimensions are marked in_bounds.
%11 = vector.transfer_write %cst, %extracted_slice_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32> | |
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %11 into %arg5[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
// Reduction loop over the shared dimension in steps of 32; the filled tile %7 is the initial accumulator.
%8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_1[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32> | |
// Operand tiles for this K step, copied into newly allocated tensors (unchanged by vectorization).
%9 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x32xf32> | |
%10 = bufferization.alloc_tensor() copy(%extracted_slice_4) : tensor<32x128xf32> | |
// Thread-level matmul on a 4x4 accumulator tile, now in vector form.
%11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) { | |
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
%14 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
%extracted_slice_5 = tensor.extract_slice %9[%12, 0] [4, 32] [1, 1] : tensor<32x32xf32> to tensor<4x32xf32> | |
%extracted_slice_6 = tensor.extract_slice %10[0, %13] [32, 4] [1, 1] : tensor<32x128xf32> to tensor<32x4xf32> | |
%extracted_slice_7 = tensor.extract_slice %arg7[%14, %15] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
// Read the 4x32 LHS strip, the 32x4 RHS strip and the current 4x4 accumulator as vectors.
%16 = vector.transfer_read %extracted_slice_5[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<4x32xf32>, vector<4x32xf32> | |
%17 = vector.transfer_read %extracted_slice_6[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<32x4xf32>, vector<32x4xf32> | |
%18 = vector.transfer_read %extracted_slice_7[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<4x4xf32>, vector<4x4xf32> | |
// Matmul-shaped contraction: maps (d0, d2) x (d2, d1) -> (d0, d1), d2 is the
// reduction iterator, combining kind is add, accumulator operand is %18.
%19 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %16, %17, %18 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32> | |
// Write the updated accumulator back into the same 4x4 slice it was read from.
%20 = vector.transfer_write %19, %extracted_slice_7[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32> | |
%21 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%22 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %20 into %arg7[%21, %22] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %11 : tensor<32x128xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Store the complete result tensor to the writeonly binding.
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Dispatch function for an f32 matmul: tensor<2048x1024xf32> x tensor<1024x512xf32> -> tensor<2048x512xf32>.
// translation_info selects the LLVMGPUMatmulSimt pipeline with workgroup_size = [32, 8, 1].
// Shape of the IR in this dump:
//   1. an outer scf.forall over 32x128 output tiles (workgroup mapping),
//   2. a thread-mapped scf.forall that writes zeros into each 4x4 sub-tile,
//   3. an scf.for over the 1024-wide reduction dimension in steps of 32, whose body is a
//      thread-mapped scf.forall doing a vector.contract per 4x4 sub-tile.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4x4xf32> | |
%c32 = arith.constant 32 : index | |
%c1024 = arith.constant 1024 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// Bindings 0 and 1 are the read-only inputs (2048x1024 and 1024x512); binding 2 is the write-only 2048x512 output.
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both inputs in full (offsets [0, 0], full sizes, unit strides).
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
// One iteration per 32x128 output tile; %arg0/%arg1 are the tile's row/column offsets.
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) { | |
// LHS row band (32x1024), RHS column band (1024x128), and the 32x128 output tile.
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32> | |
%extracted_slice_1 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32> | |
// Zero-fill: each of the 8x32 thread-mapped iterations writes the zero vector %cst
// into the 4x4 block at [4 * %arg3, 4 * %arg4] of the output tile.
%7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_2) -> (tensor<32x128xf32>) { | |
%9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
%extracted_slice_3 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
%11 = vector.transfer_write %cst, %extracted_slice_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32> | |
// %12/%13 apply the same affine map to the same operands as %9/%10 above.
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %11 into %arg5[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
// Reduction loop: %arg3 steps from 0 to 1024 by 32; %arg4 carries the 32x128 accumulator tile,
// starting from the zero-filled tile %7.
%8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_1[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32> | |
// Copies of the current 32x32 LHS and 32x128 RHS slices into newly allocated tensors.
// NOTE(review): presumably these become GPU workgroup-memory buffers after bufferization;
// no memory space is visible in this dump — confirm against later passes.
%9 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x32xf32> | |
%10 = bufferization.alloc_tensor() copy(%extracted_slice_4) : tensor<32x128xf32> | |
// Per-iteration update of one 4x4 block: acc(4x4) += lhs(4x32) * rhs(32x4).
%11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) { | |
// %12..%15 (and %21/%22 below) all compute 4 * %arg5 or 4 * %arg6; only two distinct values exist.
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
%14 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
%extracted_slice_5 = tensor.extract_slice %9[%12, 0] [4, 32] [1, 1] : tensor<32x32xf32> to tensor<4x32xf32> | |
%extracted_slice_6 = tensor.extract_slice %10[0, %13] [32, 4] [1, 1] : tensor<32x128xf32> to tensor<32x4xf32> | |
%extracted_slice_7 = tensor.extract_slice %arg7[%14, %15] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
%16 = vector.transfer_read %extracted_slice_5[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<4x32xf32>, vector<4x32xf32> | |
%17 = vector.transfer_read %extracted_slice_6[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<32x4xf32>, vector<32x4xf32> | |
%18 = vector.transfer_read %extracted_slice_7[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<4x4xf32>, vector<4x4xf32> | |
// Contraction with maps (d0,d2) x (d2,d1) -> (d0,d1), d2 being the reduction iterator; %18 is the accumulator operand.
%19 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %16, %17, %18 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32> | |
%20 = vector.transfer_write %19, %extracted_slice_7[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32> | |
%21 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%22 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %20 into %arg7[%21, %22] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %11 : tensor<32x128xf32> | |
} | |
// Insert the accumulated 32x128 tile into the shared 2048x512 result at [%arg0, %arg1].
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Store the whole 2048x512 result to the output binding %2.
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Dispatch function for an f32 matmul: tensor<2048x1024xf32> x tensor<1024x512xf32> -> tensor<2048x512xf32>.
// translation_info selects the LLVMGPUMatmulSimt pipeline with workgroup_size = [32, 8, 1].
// In this dump each thread-mapped scf.forall computes its 4x-scaled offsets once
// (%9/%10 in the fill loop, %12/%13 in the compute loop) and reuses them for both
// the extract_slice and the parallel_insert_slice.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4x4xf32> | |
%c32 = arith.constant 32 : index | |
%c1024 = arith.constant 1024 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// Bindings 0 and 1 are the read-only inputs (2048x1024 and 1024x512); binding 2 is the write-only 2048x512 output.
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both inputs in full (offsets [0, 0], full sizes, unit strides).
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
// One iteration per 32x128 output tile; %arg0/%arg1 are the tile's row/column offsets.
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) { | |
// LHS row band (32x1024), RHS column band (1024x128), and the 32x128 output tile.
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32> | |
%extracted_slice_1 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32> | |
// Zero-fill: each of the 8x32 thread-mapped iterations writes the zero vector %cst
// into the 4x4 block at [4 * %arg3, 4 * %arg4] of the output tile.
%7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_2) -> (tensor<32x128xf32>) { | |
%9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
%extracted_slice_3 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
%11 = vector.transfer_write %cst, %extracted_slice_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %11 into %arg5[%9, %10] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
// Reduction loop: %arg3 steps from 0 to 1024 by 32; %arg4 carries the 32x128 accumulator tile,
// starting from the zero-filled tile %7.
%8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_1[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32> | |
// Copies of the current 32x32 LHS and 32x128 RHS slices into newly allocated tensors.
// NOTE(review): presumably these become GPU workgroup-memory buffers after bufferization;
// no memory space is visible in this dump — confirm against later passes.
%9 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x32xf32> | |
%10 = bufferization.alloc_tensor() copy(%extracted_slice_4) : tensor<32x128xf32> | |
// Per-iteration update of one 4x4 block: acc(4x4) += lhs(4x32) * rhs(32x4).
%11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) { | |
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
%extracted_slice_5 = tensor.extract_slice %9[%12, 0] [4, 32] [1, 1] : tensor<32x32xf32> to tensor<4x32xf32> | |
%extracted_slice_6 = tensor.extract_slice %10[0, %13] [32, 4] [1, 1] : tensor<32x128xf32> to tensor<32x4xf32> | |
%extracted_slice_7 = tensor.extract_slice %arg7[%12, %13] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
%14 = vector.transfer_read %extracted_slice_5[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<4x32xf32>, vector<4x32xf32> | |
%15 = vector.transfer_read %extracted_slice_6[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<32x4xf32>, vector<32x4xf32> | |
%16 = vector.transfer_read %extracted_slice_7[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<4x4xf32>, vector<4x4xf32> | |
// Contraction with maps (d0,d2) x (d2,d1) -> (d0,d1), d2 being the reduction iterator; %16 is the accumulator operand.
%17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %14, %15, %16 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32> | |
%18 = vector.transfer_write %17, %extracted_slice_7[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %18 into %arg7[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %11 : tensor<32x128xf32> | |
} | |
// Insert the accumulated 32x128 tile into the shared 2048x512 result at [%arg0, %arg1].
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Store the whole 2048x512 result to the output binding %2.
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- // | |
// Dispatch function for an f32 matmul: tensor<2048x1024xf32> x tensor<1024x512xf32> -> tensor<2048x512xf32>.
// translation_info selects the LLVMGPUMatmulSimt pipeline with workgroup_size = [32, 8, 1].
// In this dump the three vector.transfer_read ops inside the reduction loop index the
// 32x32, 32x128 and accumulator tensors directly at the thread offsets; only the
// vector.transfer_write destination still goes through a tensor.extract_slice.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4x4xf32> | |
%c32 = arith.constant 32 : index | |
%c1024 = arith.constant 1024 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// Bindings 0 and 1 are the read-only inputs (2048x1024 and 1024x512); binding 2 is the write-only 2048x512 output.
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both inputs in full (offsets [0, 0], full sizes, unit strides).
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
// One iteration per 32x128 output tile; %arg0/%arg1 are the tile's row/column offsets.
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) { | |
// LHS row band (32x1024), RHS column band (1024x128), and the 32x128 output tile.
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32> | |
%extracted_slice_1 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32> | |
// Zero-fill: each of the 8x32 thread-mapped iterations writes the zero vector %cst
// into the 4x4 block at [4 * %arg3, 4 * %arg4] of the output tile.
%7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_2) -> (tensor<32x128xf32>) { | |
%9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
%extracted_slice_3 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
%11 = vector.transfer_write %cst, %extracted_slice_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %11 into %arg5[%9, %10] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
// Reduction loop: %arg3 steps from 0 to 1024 by 32; %arg4 carries the 32x128 accumulator tile,
// starting from the zero-filled tile %7.
%8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_1[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32> | |
// Copies of the current 32x32 LHS and 32x128 RHS slices into newly allocated tensors.
// NOTE(review): presumably these become GPU workgroup-memory buffers after bufferization;
// no memory space is visible in this dump — confirm against later passes.
%9 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x32xf32> | |
%10 = bufferization.alloc_tensor() copy(%extracted_slice_4) : tensor<32x128xf32> | |
// Per-iteration update of one 4x4 block: acc(4x4) += lhs(4x32) * rhs(32x4).
%11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) { | |
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
// Destination slice for the transfer_write below (the reads no longer use slices).
%extracted_slice_5 = tensor.extract_slice %arg7[%12, %13] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
// Reads: LHS rows [%12, %12+4) x all 32 columns, RHS all 32 rows x columns [%13, %13+4),
// and the current 4x4 accumulator block at [%12, %13].
%14 = vector.transfer_read %9[%12, %c0], %cst_0 {in_bounds = [true, true]} : tensor<32x32xf32>, vector<4x32xf32> | |
%15 = vector.transfer_read %10[%c0, %13], %cst_0 {in_bounds = [true, true]} : tensor<32x128xf32>, vector<32x4xf32> | |
%16 = vector.transfer_read %arg7[%12, %13], %cst_0 {in_bounds = [true, true]} : tensor<32x128xf32>, vector<4x4xf32> | |
// Contraction with maps (d0,d2) x (d2,d1) -> (d0,d1), d2 being the reduction iterator; %16 is the accumulator operand.
%17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %14, %15, %16 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32> | |
%18 = vector.transfer_write %17, %extracted_slice_5[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %18 into %arg7[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %11 : tensor<32x128xf32> | |
} | |
// Insert the accumulated 32x128 tile into the shared 2048x512 result at [%arg0, %arg1].
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Store the whole 2048x512 result to the output binding %2.
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Dispatch function for an f32 matmul: tensor<2048x1024xf32> x tensor<1024x512xf32> -> tensor<2048x512xf32>.
// translation_info selects the LLVMGPUMatmulSimt pipeline with workgroup_size = [32, 8, 1].
// Shape of the IR: outer scf.forall over 32x128 output tiles (workgroup mapping), a
// thread-mapped zero-fill scf.forall, then an scf.for over the reduction dimension whose
// body is a thread-mapped scf.forall doing one vector.contract per 4x4 block.
// NOTE(review): the body appears textually identical to the dump immediately preceding this one.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4x4xf32> | |
%c32 = arith.constant 32 : index | |
%c1024 = arith.constant 1024 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// Bindings 0 and 1 are the read-only inputs (2048x1024 and 1024x512); binding 2 is the write-only 2048x512 output.
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both inputs in full (offsets [0, 0], full sizes, unit strides).
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
// One iteration per 32x128 output tile; %arg0/%arg1 are the tile's row/column offsets.
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) { | |
// LHS row band (32x1024), RHS column band (1024x128), and the 32x128 output tile.
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32> | |
%extracted_slice_1 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32> | |
// Zero-fill: each of the 8x32 thread-mapped iterations writes the zero vector %cst
// into the 4x4 block at [4 * %arg3, 4 * %arg4] of the output tile.
%7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_2) -> (tensor<32x128xf32>) { | |
%9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
%extracted_slice_3 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
%11 = vector.transfer_write %cst, %extracted_slice_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %11 into %arg5[%9, %10] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
// Reduction loop: %arg3 steps from 0 to 1024 by 32; %arg4 carries the 32x128 accumulator tile,
// starting from the zero-filled tile %7.
%8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_1[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32> | |
// Copies of the current 32x32 LHS and 32x128 RHS slices into newly allocated tensors.
// NOTE(review): presumably these become GPU workgroup-memory buffers after bufferization;
// no memory space is visible in this dump — confirm against later passes.
%9 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x32xf32> | |
%10 = bufferization.alloc_tensor() copy(%extracted_slice_4) : tensor<32x128xf32> | |
// Per-iteration update of one 4x4 block: acc(4x4) += lhs(4x32) * rhs(32x4).
%11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) { | |
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
// Destination slice for the transfer_write below; the reads index their tensors directly.
%extracted_slice_5 = tensor.extract_slice %arg7[%12, %13] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
// Reads: LHS rows [%12, %12+4) x all 32 columns, RHS all 32 rows x columns [%13, %13+4),
// and the current 4x4 accumulator block at [%12, %13].
%14 = vector.transfer_read %9[%12, %c0], %cst_0 {in_bounds = [true, true]} : tensor<32x32xf32>, vector<4x32xf32> | |
%15 = vector.transfer_read %10[%c0, %13], %cst_0 {in_bounds = [true, true]} : tensor<32x128xf32>, vector<32x4xf32> | |
%16 = vector.transfer_read %arg7[%12, %13], %cst_0 {in_bounds = [true, true]} : tensor<32x128xf32>, vector<4x4xf32> | |
// Contraction with maps (d0,d2) x (d2,d1) -> (d0,d1), d2 being the reduction iterator; %16 is the accumulator operand.
%17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %14, %15, %16 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32> | |
%18 = vector.transfer_write %17, %extracted_slice_5[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %18 into %arg7[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %11 : tensor<32x128xf32> | |
} | |
// Insert the accumulated 32x128 tile into the shared 2048x512 result at [%arg0, %arg1].
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Store the whole 2048x512 result to the output binding %2.
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Dispatch function for an f32 matmul: tensor<2048x1024xf32> x tensor<1024x512xf32> -> tensor<2048x512xf32>.
// translation_info selects the LLVMGPUMatmulSimt pipeline with workgroup_size = [32, 8, 1].
// Shape of the IR: outer scf.forall over 32x128 output tiles (workgroup mapping), a
// thread-mapped zero-fill scf.forall, then an scf.for over the reduction dimension whose
// body is a thread-mapped scf.forall doing one vector.contract per 4x4 block.
// NOTE(review): the body appears textually identical to the dump immediately preceding this one.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4x4xf32> | |
%c32 = arith.constant 32 : index | |
%c1024 = arith.constant 1024 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// Bindings 0 and 1 are the read-only inputs (2048x1024 and 1024x512); binding 2 is the write-only 2048x512 output.
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Load both inputs in full (offsets [0, 0], full sizes, unit strides).
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = tensor.empty() : tensor<2048x512xf32> | |
// One iteration per 32x128 output tile; %arg0/%arg1 are the tile's row/column offsets.
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) { | |
// LHS row band (32x1024), RHS column band (1024x128), and the 32x128 output tile.
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32> | |
%extracted_slice_1 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32> | |
// Zero-fill: each of the 8x32 thread-mapped iterations writes the zero vector %cst
// into the 4x4 block at [4 * %arg3, 4 * %arg4] of the output tile.
%7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_2) -> (tensor<32x128xf32>) { | |
%9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
%extracted_slice_3 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
%11 = vector.transfer_write %cst, %extracted_slice_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %11 into %arg5[%9, %10] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
// Reduction loop: %arg3 steps from 0 to 1024 by 32; %arg4 carries the 32x128 accumulator tile,
// starting from the zero-filled tile %7.
%8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_1[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32> | |
// Copies of the current 32x32 LHS and 32x128 RHS slices into newly allocated tensors.
// NOTE(review): presumably these become GPU workgroup-memory buffers after bufferization;
// no memory space is visible in this dump — confirm against later passes.
%9 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x32xf32> | |
%10 = bufferization.alloc_tensor() copy(%extracted_slice_4) : tensor<32x128xf32> | |
// Per-iteration update of one 4x4 block: acc(4x4) += lhs(4x32) * rhs(32x4).
%11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) { | |
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
// Destination slice for the transfer_write below; the reads index their tensors directly.
%extracted_slice_5 = tensor.extract_slice %arg7[%12, %13] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
// Reads: LHS rows [%12, %12+4) x all 32 columns, RHS all 32 rows x columns [%13, %13+4),
// and the current 4x4 accumulator block at [%12, %13].
%14 = vector.transfer_read %9[%12, %c0], %cst_0 {in_bounds = [true, true]} : tensor<32x32xf32>, vector<4x32xf32> | |
%15 = vector.transfer_read %10[%c0, %13], %cst_0 {in_bounds = [true, true]} : tensor<32x128xf32>, vector<32x4xf32> | |
%16 = vector.transfer_read %arg7[%12, %13], %cst_0 {in_bounds = [true, true]} : tensor<32x128xf32>, vector<4x4xf32> | |
// Contraction with maps (d0,d2) x (d2,d1) -> (d0,d1), d2 being the reduction iterator; %16 is the accumulator operand.
%17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %14, %15, %16 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32> | |
%18 = vector.transfer_write %17, %extracted_slice_5[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %18 into %arg7[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %11 : tensor<32x128xf32> | |
} | |
// Insert the accumulated 32x128 tile into the shared 2048x512 result at [%arg0, %arg1].
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Store the whole 2048x512 result to the output binding %2.
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- // | |
// Dispatch function as printed after the iree-eliminate-empty-tensors pass.
// It contracts a 2048x1024 f32 operand with a 1024x512 f32 operand into a
// 2048x512 f32 result. The work is split into 32x128 output tiles (outer
// scf.forall), a reduction loop that advances 32 elements at a time, and
// 4x4 sub-blocks handled by an 8x32 inner scf.forall.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
// 4x4 zero vector used to initialise each per-thread block of the output tile.
%cst = arith.constant dense<0.000000e+00> : vector<4x4xf32> | |
%c32 = arith.constant 32 : index | |
%c1024 = arith.constant 1024 : index | |
// Scalar zero used as the padding operand of the vector.transfer_read ops below.
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// Bindings 0 and 1 are typed readonly (the two operands); binding 2 is typed writeonly (the result).
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
// The result tensor is loaded from binding 2 (the writeonly binding) and used as the
// shared_outs initial value of the outer scf.forall.
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> -> tensor<2048x512xf32> | |
// Outer loop: one iteration per 32x128 output tile, mapped to workgroup y/x.
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) { | |
// 32x1024 row band of the first operand, 1024x128 column band of the second, and the 32x128 output tile.
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32> | |
%extracted_slice_1 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32> | |
// Zero-fill of the output tile: 8x32 iterations mapped to thread y/x, each writing
// one 4x4 block of zeros at offsets (%arg3 * 4, %arg4 * 4).
%7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_2) -> (tensor<32x128xf32>) { | |
%9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
%extracted_slice_3 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
%11 = vector.transfer_write %cst, %extracted_slice_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %11 into %arg5[%9, %10] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
// Reduction loop from 0 to 1024 in steps of 32; the zero-filled tile %7 is the
// initial value of the loop-carried accumulator %arg4.
%8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_1[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32> | |
// The current 32x32 and 32x128 operand slices are copied into newly allocated tensors.
%9 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x32xf32> | |
%10 = bufferization.alloc_tensor() copy(%extracted_slice_4) : tensor<32x128xf32> | |
// Per-thread accumulation: each iteration reads a 4x32 slice of %9, a 32x4 slice of
// %10 and its current 4x4 block of the accumulator, contracts them (kind = add),
// and inserts the updated 4x4 block back at the same offsets.
%11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) { | |
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
%extracted_slice_5 = tensor.extract_slice %arg7[%12, %13] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
%14 = vector.transfer_read %9[%12, %c0], %cst_0 {in_bounds = [true, true]} : tensor<32x32xf32>, vector<4x32xf32> | |
%15 = vector.transfer_read %10[%c0, %13], %cst_0 {in_bounds = [true, true]} : tensor<32x128xf32>, vector<32x4xf32> | |
%16 = vector.transfer_read %arg7[%12, %13], %cst_0 {in_bounds = [true, true]} : tensor<32x128xf32>, vector<4x4xf32> | |
%17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %14, %15, %16 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32> | |
%18 = vector.transfer_write %17, %extracted_slice_5[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %18 into %arg7[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %11 : tensor<32x128xf32> | |
} | |
// Insert the finished 32x128 tile into the full result at this workgroup's offsets.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Store the complete 2048x512 result to binding 2.
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- // | |
// Dispatch function as printed after the empty-tensor-to-alloc-tensor pass.
// No tensor.empty ops appear in this body; the only allocations are the two
// bufferization.alloc_tensor ops inside the reduction loop.
// Structure: outer scf.forall over 32x128 output tiles (workgroup y/x), a
// zero-fill scf.forall (thread y/x), then an scf.for reduction in steps of 32
// whose body accumulates 4x4 blocks with vector.contract.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
// Zero vector written by the fill loop, and scalar zero used as transfer_read padding.
%cst = arith.constant dense<0.000000e+00> : vector<4x4xf32> | |
%c32 = arith.constant 32 : index | |
%c1024 = arith.constant 1024 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// Interface bindings: 0 = readonly 2048x1024, 1 = readonly 1024x512, 2 = writeonly 2048x512.
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
// Whole-tensor loads of both operands and of the destination (binding 2).
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32> | |
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> -> tensor<2048x512xf32> | |
// Workgroup-level tiling: (%arg0, %arg1) step through the result in 32x128 tiles.
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32> | |
%extracted_slice_1 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32> | |
// Thread-level fill: each of the 8x32 iterations writes %cst into its own 4x4 block.
%7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_2) -> (tensor<32x128xf32>) { | |
// Block offsets are the iteration indices multiplied by 4.
%9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
%extracted_slice_3 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
%11 = vector.transfer_write %cst, %extracted_slice_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %11 into %arg5[%9, %10] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
// Reduction over the shared dimension, 32 elements per step, accumulating into %arg4.
%8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_1[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32> | |
// Fresh tensors initialised as copies of this step's operand slices.
%9 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x32xf32> | |
%10 = bufferization.alloc_tensor() copy(%extracted_slice_4) : tensor<32x128xf32> | |
// Thread-level compute: 4x32 times 32x4 contraction added onto the 4x4 accumulator block.
%11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) { | |
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6) | |
%extracted_slice_5 = tensor.extract_slice %arg7[%12, %13] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32> | |
%14 = vector.transfer_read %9[%12, %c0], %cst_0 {in_bounds = [true, true]} : tensor<32x32xf32>, vector<4x32xf32> | |
%15 = vector.transfer_read %10[%c0, %13], %cst_0 {in_bounds = [true, true]} : tensor<32x128xf32>, vector<32x4xf32> | |
%16 = vector.transfer_read %arg7[%12, %13], %cst_0 {in_bounds = [true, true]} : tensor<32x128xf32>, vector<4x4xf32> | |
%17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %14, %15, %16 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32> | |
%18 = vector.transfer_write %17, %extracted_slice_5[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %18 into %arg7[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %11 : tensor<32x128xf32> | |
} | |
// The accumulated tile %8 replaces the 32x128 region of the shared output.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Final store of the tiled result %6 into binding 2.
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> | |
return | |
} | |
// -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- // | |
// Dispatch function as printed after the iree-codegen-iree-comprehensive-bufferize pass.
// All values are now memrefs: the three interface bindings carry the
// storage_buffer descriptor type, and the per-step operand tiles are copied into
// memref.alloc buffers in the workgroup address space. The loop structure is an
// outer scf.forall over 32x128 tiles, a zero-fill scf.forall, and an scf.for
// reduction (0 to 1024, step 32) containing the per-thread contraction.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4x4xf32> | |
%c32 = arith.constant 32 : index | |
%c1024 = arith.constant 1024 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// Each binding is materialised as a memref and followed by an assume_alignment of 64.
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> | |
// Outer loop over 32x128 output tiles, mapped to workgroup y/x.
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) { | |
// Views of the operand bands and of the output tile; %subview_2 points directly into binding 2.
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %1[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_2 = memref.subview %2[%arg0, %arg1] [32, 128] [1, 1] : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
// Zero-fill: 8x32 iterations mapped to thread y/x, each writing a 4x4 block of zeros in place.
scf.forall (%arg2, %arg3) in (8, 32) { | |
%4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2) | |
%5 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%subview_4 = memref.subview %subview_2[%4, %5] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
vector.transfer_write %cst, %subview_4[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
// %subview_5 is taken from the same source with the same offsets and sizes as
// %subview_4, so the copy below has identical source and destination views.
%subview_5 = memref.subview %subview_2[%4, %5] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview_4, %subview_5 : memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
// Reduction loop; the output tile memref is carried as %arg3 and yielded unchanged.
%3 = scf.for %arg2 = %c0 to %c1024 step %c32 iter_args(%arg3 = %subview_2) -> (memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
%subview_4 = memref.subview %subview[0, %arg2] [32, 32] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %subview_1[%arg2, 0] [32, 128] [1, 1] : memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
// Operand tiles are copied into workgroup-address-space buffers allocated inside
// the loop body; each copy is bracketed by gpu.barrier ops.
%alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
memref.copy %subview_4, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%alloc_6 = memref.alloc() : memref<32x128xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
memref.copy %subview_5, %alloc_6 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
// Per-thread compute: read 4x32 and 32x4 slices from the workgroup buffers and the
// 4x4 accumulator block from the output tile, contract with add, write the block back.
scf.forall (%arg4, %arg5) in (8, 32) { | |
%4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
%5 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%subview_7 = memref.subview %arg3[%4, %5] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%6 = vector.transfer_read %alloc[%4, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x32xf32, #gpu.address_space<workgroup>>, vector<4x32xf32> | |
%7 = vector.transfer_read %alloc_6[%c0, %5], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, #gpu.address_space<workgroup>>, vector<32x4xf32> | |
%8 = vector.transfer_read %arg3[%4, %5], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<4x4xf32> | |
%9 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %7, %8 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32> | |
vector.transfer_write %9, %subview_7[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
// %subview_8 repeats the subview expression of %subview_7 (same source, offsets, sizes).
%subview_8 = memref.subview %arg3[%4, %5] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview_7, %subview_8 : memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %arg3 : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
// %subview_3 addresses the same region of %2 as %subview_2, which is the initial
// (and yielded) value of the loop result %3.
%subview_3 = memref.subview %2[%arg0, %arg1] [32, 128] [1, 1] : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %3, %subview_3 : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Source and destination of this copy are the same value (%2).
memref.copy %2, %2 : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> | |
return | |
} | |
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- // | |
// Dispatch function as printed after the resolve-shaped-type-result-dims pass.
// Every memref type in this body has static dimensions. The function works on
// memrefs: bindings 0/1 are the read-only operands, binding 2 is the result.
// Each 32x128 result tile is zero-filled and then accumulated in place over a
// reduction loop of step 32, using workgroup-address-space staging buffers.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
// Constants: 4x4 zero vector for the fill, loop bounds/step, and scalar zero padding value.
%cst = arith.constant dense<0.000000e+00> : vector<4x4xf32> | |
%c32 = arith.constant 32 : index | |
%c1024 = arith.constant 1024 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// Binding memrefs, each with an alignment assumption of 64.
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> | |
// Workgroup-level loop: (%arg0, %arg1) are the row/column offsets of a 32x128 result tile.
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) { | |
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %1[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_2 = memref.subview %2[%arg0, %arg1] [32, 128] [1, 1] : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
// Thread-level zero-fill of the result tile in 4x4 blocks.
scf.forall (%arg2, %arg3) in (8, 32) { | |
%4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2) | |
%5 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%subview_4 = memref.subview %subview_2[%4, %5] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
vector.transfer_write %cst, %subview_4[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
// The copy below is between two views built from the same subview expression.
%subview_5 = memref.subview %subview_2[%4, %5] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview_4, %subview_5 : memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
// Reduction loop (0 to 1024, step 32); %arg3 starts as %subview_2 and is yielded as-is.
%3 = scf.for %arg2 = %c0 to %c1024 step %c32 iter_args(%arg3 = %subview_2) -> (memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
%subview_4 = memref.subview %subview[0, %arg2] [32, 32] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %subview_1[%arg2, 0] [32, 128] [1, 1] : memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
// Stage the 32x32 and 32x128 operand tiles into workgroup-address-space buffers,
// with a gpu.barrier before and after each copy.
%alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
memref.copy %subview_4, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%alloc_6 = memref.alloc() : memref<32x128xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
memref.copy %subview_5, %alloc_6 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
// Thread-level contraction: (4x32) x (32x4) added onto the 4x4 block read from %arg3.
scf.forall (%arg4, %arg5) in (8, 32) { | |
%4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
%5 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5) | |
%subview_7 = memref.subview %arg3[%4, %5] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%6 = vector.transfer_read %alloc[%4, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x32xf32, #gpu.address_space<workgroup>>, vector<4x32xf32> | |
%7 = vector.transfer_read %alloc_6[%c0, %5], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, #gpu.address_space<workgroup>>, vector<32x4xf32> | |
%8 = vector.transfer_read %arg3[%4, %5], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<4x4xf32> | |
%9 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %7, %8 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32> | |
vector.transfer_write %9, %subview_7[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
// As in the fill loop, the trailing copy is between two views of the same 4x4 region.
%subview_8 = memref.subview %arg3[%4, %5] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview_7, %subview_8 : memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %arg3 : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
// Copy of the loop result into a view of %2 built with the same offsets and sizes as %subview_2.
%subview_3 = memref.subview %2[%arg0, %arg1] [32, 128] [1, 1] : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %3, %subview_3 : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Whole-buffer copy whose source and destination are both %2.
memref.copy %2, %2 : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Dispatch function as printed after the canonicalize pass.
// In this form the reduction scf.for has no iter_args and produces no result:
// the inner loop reads and writes the output tile %subview_2 directly. There is
// no whole-buffer copy before the return. The per-block and per-tile
// memref.copy ops between same-region subviews are still present.
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4x4xf32> | |
%c32 = arith.constant 32 : index | |
%c1024 = arith.constant 1024 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
// Bindings 0 and 1: read-only operand buffers; binding 2: result buffer.
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> | |
// Outer loop over 32x128 result tiles, mapped to workgroup y/x.
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) { | |
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %1[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_2 = memref.subview %2[%arg0, %arg1] [32, 128] [1, 1] : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
// Zero-fill of the result tile: 8x32 iterations (thread y/x), one 4x4 block each.
scf.forall (%arg2, %arg3) in (8, 32) { | |
%3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2) | |
%4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%subview_4 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
vector.transfer_write %cst, %subview_4[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
// %subview_5 duplicates the subview expression of %subview_4; the copy is between those two views.
%subview_5 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview_4, %subview_5 : memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
// Reduction loop (0 to 1024, step 32) with no loop-carried values.
scf.for %arg2 = %c0 to %c1024 step %c32 { | |
%subview_4 = memref.subview %subview[0, %arg2] [32, 32] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %subview_1[%arg2, 0] [32, 128] [1, 1] : memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
// Operand tiles for this step are copied into workgroup-address-space buffers,
// each copy surrounded by gpu.barrier ops.
%alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
memref.copy %subview_4, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%alloc_6 = memref.alloc() : memref<32x128xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
memref.copy %subview_5, %alloc_6 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
// Per-thread contraction; the accumulator block is read from and written to %subview_2.
scf.forall (%arg3, %arg4) in (8, 32) { | |
%3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
%subview_7 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = vector.transfer_read %alloc[%3, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x32xf32, #gpu.address_space<workgroup>>, vector<4x32xf32> | |
%6 = vector.transfer_read %alloc_6[%c0, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, #gpu.address_space<workgroup>>, vector<32x4xf32> | |
%7 = vector.transfer_read %subview_2[%3, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<4x4xf32> | |
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %5, %6, %7 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32> | |
vector.transfer_write %8, %subview_7[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
// %subview_8 is built from the same expression as %subview_7.
%subview_8 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview_7, %subview_8 : memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
} | |
// %subview_3 is taken from %2 with the same offsets and sizes as %subview_2, so this
// tile copy is between two views of the same region.
%subview_3 = memref.subview %2[%arg0, %arg1] [32, 128] [1, 1] : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview_2, %subview_3 : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4x4xf32> | |
%c32 = arith.constant 32 : index | |
%c1024 = arith.constant 1024 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) { | |
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %1[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_2 = memref.subview %2[%arg0, %arg1] [32, 128] [1, 1] : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg2, %arg3) in (8, 32) { | |
%3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2) | |
%4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%subview_3 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
vector.transfer_write %cst, %subview_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview_3, %subview_3 : memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.for %arg2 = %c0 to %c1024 step %c32 { | |
%subview_3 = memref.subview %subview[0, %arg2] [32, 32] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_4 = memref.subview %subview_1[%arg2, 0] [32, 128] [1, 1] : memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
memref.copy %subview_3, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%alloc_5 = memref.alloc() : memref<32x128xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
memref.copy %subview_4, %alloc_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
scf.forall (%arg3, %arg4) in (8, 32) { | |
%3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
%subview_6 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = vector.transfer_read %alloc[%3, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x32xf32, #gpu.address_space<workgroup>>, vector<4x32xf32> | |
%6 = vector.transfer_read %alloc_5[%c0, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, #gpu.address_space<workgroup>>, vector<32x4xf32> | |
%7 = vector.transfer_read %subview_2[%3, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<4x4xf32> | |
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %5, %6, %7 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32> | |
vector.transfer_write %8, %subview_6[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview_6, %subview_6 : memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
} | |
memref.copy %subview_2, %subview_2 : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4x4xf32> | |
%c32 = arith.constant 32 : index | |
%c1024 = arith.constant 1024 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) { | |
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %1[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_2 = memref.subview %2[%arg0, %arg1] [32, 128] [1, 1] : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg2, %arg3) in (8, 32) { | |
%3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2) | |
%4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%subview_3 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
vector.transfer_write %cst, %subview_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.for %arg2 = %c0 to %c1024 step %c32 { | |
%subview_3 = memref.subview %subview[0, %arg2] [32, 32] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_4 = memref.subview %subview_1[%arg2, 0] [32, 128] [1, 1] : memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
memref.copy %subview_3, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%alloc_5 = memref.alloc() : memref<32x128xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
memref.copy %subview_4, %alloc_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
scf.forall (%arg3, %arg4) in (8, 32) { | |
%3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
%subview_6 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = vector.transfer_read %alloc[%3, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x32xf32, #gpu.address_space<workgroup>>, vector<4x32xf32> | |
%6 = vector.transfer_read %alloc_5[%c0, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, #gpu.address_space<workgroup>>, vector<32x4xf32> | |
%7 = vector.transfer_read %subview_2[%3, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<4x4xf32> | |
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %5, %6, %7 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32> | |
vector.transfer_write %8, %subview_6[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- // | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4x4xf32> | |
%c32 = arith.constant 32 : index | |
%c1024 = arith.constant 1024 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) { | |
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %1[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_2 = memref.subview %2[%arg0, %arg1] [32, 128] [1, 1] : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg2, %arg3) in (8, 32) { | |
%3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2) | |
%4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%subview_3 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
vector.transfer_write %cst, %subview_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.for %arg2 = %c0 to %c1024 step %c32 { | |
%subview_3 = memref.subview %subview[0, %arg2] [32, 32] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_4 = memref.subview %subview_1[%arg2, 0] [32, 128] [1, 1] : memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
memref.copy %subview_3, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%alloc_5 = memref.alloc() : memref<32x128xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
memref.copy %subview_4, %alloc_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
scf.forall (%arg3, %arg4) in (8, 32) { | |
%3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
%subview_6 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = vector.transfer_read %alloc[%3, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x32xf32, #gpu.address_space<workgroup>>, vector<4x32xf32> | |
%6 = vector.transfer_read %alloc_5[%c0, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, #gpu.address_space<workgroup>>, vector<32x4xf32> | |
%7 = vector.transfer_read %subview_2[%3, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<4x4xf32> | |
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %5, %6, %7 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32> | |
vector.transfer_write %8, %subview_6[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4x4xf32> | |
%c32 = arith.constant 32 : index | |
%c1024 = arith.constant 1024 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) { | |
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %1[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_2 = memref.subview %2[%arg0, %arg1] [32, 128] [1, 1] : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg2, %arg3) in (8, 32) { | |
%3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2) | |
%4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%subview_3 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
vector.transfer_write %cst, %subview_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.for %arg2 = %c0 to %c1024 step %c32 { | |
%subview_3 = memref.subview %subview[0, %arg2] [32, 32] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_4 = memref.subview %subview_1[%arg2, 0] [32, 128] [1, 1] : memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
memref.copy %subview_3, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%alloc_5 = memref.alloc() : memref<32x128xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
memref.copy %subview_4, %alloc_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
scf.forall (%arg3, %arg4) in (8, 32) { | |
%3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
%subview_6 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = vector.transfer_read %alloc[%3, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x32xf32, #gpu.address_space<workgroup>>, vector<4x32xf32> | |
%6 = vector.transfer_read %alloc_5[%c0, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, #gpu.address_space<workgroup>>, vector<32x4xf32> | |
%7 = vector.transfer_read %subview_2[%3, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<4x4xf32> | |
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %5, %6, %7 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32> | |
vector.transfer_write %8, %subview_6[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4x4xf32> | |
%c32 = arith.constant 32 : index | |
%c1024 = arith.constant 1024 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) { | |
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %1[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_2 = memref.subview %2[%arg0, %arg1] [32, 128] [1, 1] : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg2, %arg3) in (8, 32) { | |
%3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2) | |
%4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%subview_3 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
vector.transfer_write %cst, %subview_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.for %arg2 = %c0 to %c1024 step %c32 { | |
%subview_3 = memref.subview %subview[0, %arg2] [32, 32] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_4 = memref.subview %subview_1[%arg2, 0] [32, 128] [1, 1] : memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
memref.copy %subview_3, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%alloc_5 = memref.alloc() : memref<32x128xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
memref.copy %subview_4, %alloc_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, #gpu.address_space<workgroup>> | |
gpu.barrier | |
scf.forall (%arg3, %arg4) in (8, 32) { | |
%3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3) | |
%4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) | |
%subview_6 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = vector.transfer_read %alloc[%3, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x32xf32, #gpu.address_space<workgroup>>, vector<4x32xf32> | |
%6 = vector.transfer_read %alloc_5[%c0, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, #gpu.address_space<workgroup>>, vector<32x4xf32> | |
%7 = vector.transfer_read %subview_2[%3, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<4x4xf32> | |
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %5, %6, %7 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32> | |
vector.transfer_write %8, %subview_6[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
================================================================= | |
==2248266==ERROR: AddressSanitizer: heap-use-after-free on address 0x50e000021d2c at pc 0x7c64fbddc217 bp 0x7ffd4abe65f0 sp 0x7ffd4abe65e8 | |
READ of size 4 at 0x50e000021d2c thread T0 | |
#0 0x7c64fbddc216 in mlir::Operation::getRegions() /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/Operation.h:674:9 | |
#1 0x7c64fbddc216 in mlir::ForwardIterator::makeIterable(mlir::Operation&) /home/nod/iree/third_party/llvm-project/mlir/lib/IR/Visitors.cpp:18:16 | |
#2 0x7c64fbb78331 in void mlir::detail::walk<mlir::ForwardIterator>(mlir::Operation*, llvm::function_ref<void (mlir::Operation*)>, mlir::WalkOrder) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/Visitors.h:176:23 | |
#3 0x7c6501400efe in std::enable_if<!llvm::is_one_of<mlir::gpu::ThreadIdOp, mlir::Operation*, mlir::Region*, mlir::Block*>::value && std::is_same<void, void>::value, void>::type mlir::detail::walk<(mlir::WalkOrder)1, mlir::ForwardIterator, void replaceUnitMappingIdsHelper<mlir::gpu::ThreadIdOp, mlir::Operation>(mlir::RewriterBase&, mlir::Location, mlir::Operation*, mlir::Value, llvm::ArrayRef<long>)::'lambda'(mlir::gpu::ThreadIdOp), mlir::gpu::ThreadIdOp, void>(mlir::Operation*, void replaceUnitMappingIdsHelper<mlir::gpu::ThreadIdOp, mlir::Operation>(mlir::RewriterBase&, mlir::Location, mlir::Operation*, mlir::Value, llvm::ArrayRef<long>)::'lambda'(mlir::gpu::ThreadIdOp)&&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/Visitors.h:340:10 | |
#4 0x7c6501400efe in std::enable_if<llvm::function_traits<std::decay<void replaceUnitMappingIdsHelper<mlir::gpu::ThreadIdOp, mlir::Operation>(mlir::RewriterBase&, mlir::Location, mlir::Operation*, mlir::Value, llvm::ArrayRef<long>)::'lambda'(mlir::gpu::ThreadIdOp)>::type>::num_args == 1, void>::type mlir::Operation::walk<(mlir::WalkOrder)1, mlir::ForwardIterator, void replaceUnitMappingIdsHelper<mlir::gpu::ThreadIdOp, mlir::Operation>(mlir::RewriterBase&, mlir::Location, mlir::Operation*, mlir::Value, llvm::ArrayRef<long>)::'lambda'(mlir::gpu::ThreadIdOp), void>(void replaceUnitMappingIdsHelper<mlir::gpu::ThreadIdOp, mlir::Operation>(mlir::RewriterBase&, mlir::Location, mlir::Operation*, mlir::Value, llvm::ArrayRef<long>)::'lambda'(mlir::gpu::ThreadIdOp)&&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/Operation.h:794:12 | |
#5 0x7c6501400efe in void replaceUnitMappingIdsHelper<mlir::gpu::ThreadIdOp, mlir::Operation>(mlir::RewriterBase&, mlir::Location, mlir::Operation*, mlir::Value, llvm::ArrayRef<long>) /home/nod/iree/third_party/llvm-project/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp:408:11 | |
#6 0x7c6501400efe in mlir::transform::gpu::mapNestedForallToThreadsImpl(mlir::RewriterBase&, std::optional<mlir::transform::TransformOpInterface>, mlir::Operation*, llvm::ArrayRef<long>, long, bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp:875:3 | |
#7 0x7c650132fdb5 in mlir::iree_compiler::(anonymous namespace)::GPUDistributePass::runOnOperation()::'lambda'(mlir::scf::ForallOp)::operator()(mlir::scf::ForallOp) const /home/nod/iree/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistribute.cpp:51:18 | |
#8 0x7c650132fdb5 in std::enable_if<!llvm::is_one_of<mlir::scf::ForallOp, mlir::Operation*, mlir::Region*, mlir::Block*>::value && std::is_same<mlir::WalkResult, mlir::WalkResult>::value, mlir::WalkResult>::type mlir::detail::walk<(mlir::WalkOrder)1, mlir::ForwardIterator, mlir::iree_compiler::(anonymous namespace)::GPUDistributePass::runOnOperation()::'lambda'(mlir::scf::ForallOp), mlir::scf::ForallOp, mlir::WalkResult>(mlir::Operation*, mlir::iree_compiler::(anonymous namespace)::GPUDistributePass::runOnOperation()::'lambda'(mlir::scf::ForallOp)&&)::'lambda'(mlir::Operation*)::operator()(mlir::Operation*) const /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/Visitors.h:375:14 | |
#9 0x7c650132fdb5 in mlir::WalkResult llvm::function_ref<mlir::WalkResult (mlir::Operation*)>::callback_fn<std::enable_if<!llvm::is_one_of<mlir::scf::ForallOp, mlir::Operation*, mlir::Region*, mlir::Block*>::value && std::is_same<mlir::WalkResult, mlir::WalkResult>::value, mlir::WalkResult>::type mlir::detail::walk<(mlir::WalkOrder)1, mlir::ForwardIterator, mlir::iree_compiler::(anonymous namespace)::GPUDistributePass::runOnOperation()::'lambda'(mlir::scf::ForallOp), mlir::scf::ForallOp, mlir::WalkResult>(mlir::Operation*, mlir::iree_compiler::(anonymous namespace)::GPUDistributePass::runOnOperation()::'lambda'(mlir::scf::ForallOp)&&)::'lambda'(mlir::Operation*)>(long, mlir::Operation*) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12 | |
#10 0x7c64fb8650a5 in mlir::WalkResult mlir::detail::walk<mlir::ForwardIterator>(mlir::Operation*, llvm::function_ref<mlir::WalkResult (mlir::Operation*)>, mlir::WalkOrder) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/Visitors.h:273:13 | |
#11 0x7c64fb8650a5 in mlir::WalkResult mlir::detail::walk<mlir::ForwardIterator>(mlir::Operation*, llvm::function_ref<mlir::WalkResult (mlir::Operation*)>, mlir::WalkOrder) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/Visitors.h:273:13 | |
#12 0x7c650132f600 in std::enable_if<!llvm::is_one_of<mlir::scf::ForallOp, mlir::Operation*, mlir::Region*, mlir::Block*>::value && std::is_same<mlir::WalkResult, mlir::WalkResult>::value, mlir::WalkResult>::type mlir::detail::walk<(mlir::WalkOrder)1, mlir::ForwardIterator, mlir::iree_compiler::(anonymous namespace)::GPUDistributePass::runOnOperation()::'lambda'(mlir::scf::ForallOp), mlir::scf::ForallOp, mlir::WalkResult>(mlir::Operation*, mlir::iree_compiler::(anonymous namespace)::GPUDistributePass::runOnOperation()::'lambda'(mlir::scf::ForallOp)&&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/Visitors.h:378:10 | |
#13 0x7c650132f600 in std::enable_if<llvm::function_traits<std::decay<mlir::iree_compiler::(anonymous namespace)::GPUDistributePass::runOnOperation()::'lambda'(mlir::scf::ForallOp)>::type>::num_args == 1, mlir::WalkResult>::type mlir::Operation::walk<(mlir::WalkOrder)1, mlir::ForwardIterator, mlir::iree_compiler::(anonymous namespace)::GPUDistributePass::runOnOperation()::'lambda'(mlir::scf::ForallOp), mlir::WalkResult>(mlir::iree_compiler::(anonymous namespace)::GPUDistributePass::runOnOperation()::'lambda'(mlir::scf::ForallOp)&&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/Operation.h:794:12 | |
#14 0x7c650132f600 in mlir::iree_compiler::(anonymous namespace)::GPUDistributePass::runOnOperation() /home/nod/iree/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistribute.cpp:46:37 | |
#15 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:526:17 | |
#16 0x7c64fc02f337 in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12 | |
#17 0x7c64fc02f337 in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12 | |
#18 0x7c64fc02f337 in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7 | |
#19 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21 | |
#20 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16 | |
#21 0x7c64fc03cc84 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:509:12 | |
#22 0x7c64fc03cc84 in llvm::LogicalResult llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0>(long, mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12 | |
#23 0x7c650086cb34 in llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12 | |
#24 0x7c650086cb34 in mlir::Pass::runPipeline(mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/Pass/Pass.h:200:12 | |
#25 0x7c650086cb34 in mlir::iree_compiler::(anonymous namespace)::LLVMGPULowerExecutableTargetPass::runOnOperation() /home/nod/iree/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp:173:14 | |
#26 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:526:17 | |
#27 0x7c64fc02f337 in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12 | |
#28 0x7c64fc02f337 in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12 | |
#29 0x7c64fc02f337 in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7 | |
#30 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21 | |
#31 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16 | |
#32 0x7c64fc0359d5 in mlir::detail::OpToOpPassAdaptor::runOnOperationImpl(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:733:20 | |
#33 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::runOnOperation(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:715:5 | |
#34 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:524:20 | |
#35 0x7c64fc02f6eb in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12 | |
#36 0x7c64fc02f6eb in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12 | |
#37 0x7c64fc02f6eb in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7 | |
#38 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21 | |
#39 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16 | |
#40 0x7c64fc0359d5 in mlir::detail::OpToOpPassAdaptor::runOnOperationImpl(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:733:20 | |
#41 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::runOnOperation(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:715:5 | |
#42 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:524:20 | |
#43 0x7c64fc02f6eb in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12 | |
#44 0x7c64fc02f6eb in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12 | |
#45 0x7c64fc02f6eb in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7 | |
#46 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21 | |
#47 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16 | |
#48 0x7c64fc03cc84 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:509:12 | |
#49 0x7c64fc03cc84 in llvm::LogicalResult llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0>(long, mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12 | |
#50 0x7c64ffc0119f in llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12 | |
#51 0x7c64ffc0119f in mlir::Pass::runPipeline(mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/Pass/Pass.h:200:12 | |
#52 0x7c64ffc0119f in mlir::iree_compiler::IREE::HAL::(anonymous namespace)::TranslateTargetExecutableVariantsPass::runOnOperation() /home/nod/iree/compiler/src/iree/compiler/Dialect/HAL/Transforms/TranslateExecutables.cpp:68:16 | |
#53 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:526:17 | |
#54 0x7c64fc02f337 in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12 | |
#55 0x7c64fc02f337 in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12 | |
#56 0x7c64fc02f337 in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7 | |
#57 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21 | |
#58 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16 | |
#59 0x7c64fc0359d5 in mlir::detail::OpToOpPassAdaptor::runOnOperationImpl(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:733:20 | |
#60 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::runOnOperation(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:715:5 | |
#61 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:524:20 | |
#62 0x7c64fc02f6eb in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12 | |
#63 0x7c64fc02f6eb in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12 | |
#64 0x7c64fc02f6eb in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7 | |
#65 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21 | |
#66 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16 | |
#67 0x7c64fc03cc84 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:509:12 | |
#68 0x7c64fc03cc84 in llvm::LogicalResult llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0>(long, mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12 | |
#69 0x7c64ffc02e76 in llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12 | |
#70 0x7c64ffc02e76 in mlir::Pass::runPipeline(mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/Pass/Pass.h:200:12 | |
#71 0x7c64ffc02e76 in mlir::iree_compiler::IREE::HAL::(anonymous namespace)::TranslateExecutablesPass::runOnOperation() /home/nod/iree/compiler/src/iree/compiler/Dialect/HAL/Transforms/TranslateExecutables.cpp:108:16 | |
#72 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:526:17 | |
#73 0x7c64fc02f337 in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12 | |
#74 0x7c64fc02f337 in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12 | |
#75 0x7c64fc02f337 in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7 | |
#76 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21 | |
#77 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16 | |
#78 0x7c64fc0359d5 in mlir::detail::OpToOpPassAdaptor::runOnOperationImpl(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:733:20 | |
#79 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::runOnOperation(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:715:5 | |
#80 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:524:20 | |
#81 0x7c64fc02f6eb in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12 | |
#82 0x7c64fc02f6eb in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12 | |
#83 0x7c64fc02f6eb in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7 | |
#84 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21 | |
#85 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16 | |
#86 0x7c64fc03794e in mlir::PassManager::runPasses(mlir::Operation*, mlir::AnalysisManager) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:905:10 | |
#87 0x7c64fc03794e in mlir::PassManager::run(mlir::Operation*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:885:60 | |
#88 0x7c64fb890e8e in mlir::iree_compiler::embed::(anonymous namespace)::Invocation::runPipeline(iree_compiler_pipeline_t) /home/nod/iree/compiler/src/iree/compiler/API/Internal/CompilerDriver.cpp:1008:27 | |
#89 0x7c64fb890e8e in ireeCompilerInvocationPipeline /home/nod/iree/compiler/src/iree/compiler/API/Internal/CompilerDriver.cpp:1447:23 | |
#90 0x7c64fbe80d19 in mlir::iree_compiler::runIreecMain(int, char**)::$_2::operator()(iree_compiler_source_t*) const /home/nod/iree/compiler/src/iree/compiler/Tools/iree_compile_lib.cc:254:12 | |
#91 0x7c64fbe7f7c4 in mlir::iree_compiler::runIreecMain(int, char**) /home/nod/iree/compiler/src/iree/compiler/Tools/iree_compile_lib.cc:355:10 | |
#92 0x7c64ed629d8f in __libc_start_call_main csu/../sysdeps/nptl/libc_start_call_main.h:58:16 | |
#93 0x7c64ed629e3f in __libc_start_main csu/../csu/libc-start.c:392:3 | |
#94 0x5bfbeffbde54 in _start (/home/nod/iree-build/tools/iree-compile+0x67e54) (BuildId: 2c094ba9c7dc6f92) | |
0x50e000021d2c is located 44 bytes inside of 152-byte region [0x50e000021d00,0x50e000021d98) | |
freed by thread T0 here: | |
#0 0x5bfbf0057df6 in free (/home/nod/iree-build/tools/iree-compile+0x101df6) (BuildId: 2c094ba9c7dc6f92) | |
#1 0x7c64fbd15747 in mlir::Operation::destroy() /home/nod/iree/third_party/llvm-project/mlir/lib/IR/Operation.cpp:214:3 | |
#2 0x7c64fbd15747 in llvm::ilist_traits<mlir::Operation>::deleteNode(mlir::Operation*) /home/nod/iree/third_party/llvm-project/mlir/lib/IR/Operation.cpp:492:7 | |
#3 0x7c64fbd15747 in llvm::iplist_impl<llvm::simple_ilist<mlir::Operation>, llvm::ilist_traits<mlir::Operation>>::erase(llvm::ilist_iterator<llvm::ilist_detail::node_options<mlir::Operation, false, false, void, false, void>, false, false>) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/ilist.h:205:5 | |
previously allocated by thread T0 here: | |
#0 0x5bfbf005808f in malloc (/home/nod/iree-build/tools/iree-compile+0x10208f) (BuildId: 2c094ba9c7dc6f92) | |
#1 0x7c64fbcfc71a in mlir::Operation::create(mlir::Location, mlir::OperationName, mlir::TypeRange, mlir::ValueRange, mlir::DictionaryAttr, mlir::OpaqueProperties, mlir::BlockRange, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/IR/Operation.cpp:114:46 | |
#2 0x7c64fbcfbdc9 in mlir::Operation::create(mlir::Location, mlir::OperationName, mlir::TypeRange, mlir::ValueRange, mlir::NamedAttrList&&, mlir::OpaqueProperties, mlir::BlockRange, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/IR/Operation.cpp:75:10 | |
#3 0x7c64fbcfbdc9 in mlir::Operation::create(mlir::Location, mlir::OperationName, mlir::TypeRange, mlir::ValueRange, mlir::NamedAttrList&&, mlir::OpaqueProperties, mlir::BlockRange, mlir::RegionRange) /home/nod/iree/third_party/llvm-project/mlir/lib/IR/Operation.cpp:58:7 | |
#4 0x7c64fbcfba19 in mlir::Operation::create(mlir::OperationState const&) /home/nod/iree/third_party/llvm-project/mlir/lib/IR/Operation.cpp:36:7 | |
#5 0x7c64fbb7586f in mlir::OpBuilder::create(mlir::OperationState const&) /home/nod/iree/third_party/llvm-project/mlir/lib/IR/Builders.cpp:498:17 | |
#6 0x7c6504ae8e42 in mlir::scf::(anonymous namespace)::ForallOpInterface::bufferize(mlir::Operation*, mlir::RewriterBase&, mlir::bufferization::BufferizationOptions const&) const /home/nod/iree/third_party/llvm-project/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp:1267:28 | |
#7 0x7c6504ae8e42 in mlir::bufferization::detail::BufferizableOpInterfaceInterfaceTraits::FallbackModel<mlir::scf::(anonymous namespace)::ForallOpInterface>::bufferize(mlir::bufferization::detail::BufferizableOpInterfaceInterfaceTraits::Concept const*, mlir::Operation*, mlir::RewriterBase&, mlir::bufferization::BufferizationOptions const&) /home/nod/iree-build/llvm-project/tools/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h.inc:1052:49 | |
#8 0x7c6504e1ab4e in mlir::bufferization::bufferizeOp(mlir::Operation*, mlir::bufferization::BufferizationOptions const&, mlir::bufferization::BufferizationStatistics*) /home/nod/iree/third_party/llvm-project/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp:478:31 | |
#9 0x7c6502361a85 in mlir::iree_compiler::runIREEOneShotBufferize(mlir::Operation*, mlir::iree_compiler::IREEOneShotBufferizationOptions const&) /home/nod/iree/compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp:213:10 | |
#10 0x7c6502361a85 in mlir::iree_compiler::(anonymous namespace)::IREEComprehensiveBufferizePass::runOnOperation() /home/nod/iree/compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp:230:14 | |
#11 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:526:17 | |
#12 0x7c64fc02f337 in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12 | |
#13 0x7c64fc02f337 in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12 | |
#14 0x7c64fc02f337 in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7 | |
#15 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21 | |
#16 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16 | |
#17 0x7c64fc03cc84 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:509:12 | |
#18 0x7c64fc03cc84 in llvm::LogicalResult llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0>(long, mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12 | |
#19 0x7c650086cb34 in llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12 | |
#20 0x7c650086cb34 in mlir::Pass::runPipeline(mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/Pass/Pass.h:200:12 | |
#21 0x7c650086cb34 in mlir::iree_compiler::(anonymous namespace)::LLVMGPULowerExecutableTargetPass::runOnOperation() /home/nod/iree/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp:173:14 | |
#22 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:526:17 | |
#23 0x7c64fc02f337 in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12 | |
#24 0x7c64fc02f337 in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12 | |
#25 0x7c64fc02f337 in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7 | |
#26 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21 | |
#27 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16 | |
#28 0x7c64fc0359d5 in mlir::detail::OpToOpPassAdaptor::runOnOperationImpl(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:733:20 | |
#29 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::runOnOperation(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:715:5 | |
#30 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:524:20 | |
#31 0x7c64fc02f6eb in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12 | |
#32 0x7c64fc02f6eb in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12 | |
#33 0x7c64fc02f6eb in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7 | |
#34 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21 | |
#35 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16 | |
#36 0x7c64fc0359d5 in mlir::detail::OpToOpPassAdaptor::runOnOperationImpl(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:733:20 | |
#37 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::runOnOperation(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:715:5 | |
#38 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:524:20 | |
#39 0x7c64fc02f6eb in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12 | |
#40 0x7c64fc02f6eb in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12 | |
#41 0x7c64fc02f6eb in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7 | |
#42 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21 | |
#43 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16 | |
#44 0x7c64fc03cc84 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:509:12 | |
#45 0x7c64fc03cc84 in llvm::LogicalResult llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0>(long, mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12 | |
#46 0x7c64ffc0119f in llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12 | |
#47 0x7c64ffc0119f in mlir::Pass::runPipeline(mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/Pass/Pass.h:200:12 | |
#48 0x7c64ffc0119f in mlir::iree_compiler::IREE::HAL::(anonymous namespace)::TranslateTargetExecutableVariantsPass::runOnOperation() /home/nod/iree/compiler/src/iree/compiler/Dialect/HAL/Transforms/TranslateExecutables.cpp:68:16 | |
#49 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:526:17 | |
#50 0x7c64fc02f337 in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12 | |
#51 0x7c64fc02f337 in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12 | |
#52 0x7c64fc02f337 in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7 | |
#53 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21 | |
#54 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16 | |
#55 0x7c64fc0359d5 in mlir::detail::OpToOpPassAdaptor::runOnOperationImpl(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:733:20 | |
#56 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::runOnOperation(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:715:5 | |
#57 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:524:20 | |
#58 0x7c64fc02f6eb in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12 | |
#59 0x7c64fc02f6eb in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12 | |
#60 0x7c64fc02f6eb in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7 | |
#61 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21 | |
#62 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16 | |
#63 0x7c64fc03cc84 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:509:12 | |
#64 0x7c64fc03cc84 in llvm::LogicalResult llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0>(long, mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12 | |
#65 0x7c64ffc02e76 in llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12 | |
#66 0x7c64ffc02e76 in mlir::Pass::runPipeline(mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/Pass/Pass.h:200:12 | |
#67 0x7c64ffc02e76 in mlir::iree_compiler::IREE::HAL::(anonymous namespace)::TranslateExecutablesPass::runOnOperation() /home/nod/iree/compiler/src/iree/compiler/Dialect/HAL/Transforms/TranslateExecutables.cpp:108:16 | |
#68 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:526:17 | |
#69 0x7c64fc02f337 in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12 | |
#70 0x7c64fc02f337 in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12 | |
#71 0x7c64fc02f337 in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7 | |
#72 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21 | |
SUMMARY: AddressSanitizer: heap-use-after-free /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/Operation.h:674:9 in mlir::Operation::getRegions() | |
Shadow bytes around the buggy address: | |
0x50e000021a80: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd | |
0x50e000021b00: fa fa fa fa fa fa fa fa fd fd fd fd fd fd fd fd | |
0x50e000021b80: fd fd fd fd fd fd fd fd fd fd fd fa fa fa fa fa | |
0x50e000021c00: fa fa fa fa fd fd fd fd fd fd fd fd fd fd fd fd | |
0x50e000021c80: fd fd fd fd fd fd fd fa fa fa fa fa fa fa fa fa | |
=>0x50e000021d00: fd fd fd fd fd[fd]fd fd fd fd fd fd fd fd fd fd | |
0x50e000021d80: fd fd fd fa fa fa fa fa fa fa fa fa 00 00 00 00 | |
0x50e000021e00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 fa | |
0x50e000021e80: fa fa fa fa fa fa fa fa 00 00 00 00 00 00 00 00 | |
0x50e000021f00: 00 00 00 00 00 00 00 00 00 00 00 fa fa fa fa fa | |
0x50e000021f80: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa | |
Shadow byte legend (one shadow byte represents 8 application bytes): | |
Addressable: 00 | |
Partially addressable: 01 02 03 04 05 06 07 | |
Heap left redzone: fa | |
Freed heap region: fd | |
Stack left redzone: f1 | |
Stack mid redzone: f2 | |
Stack right redzone: f3 | |
Stack after return: f5 | |
Stack use after scope: f8 | |
Global redzone: f9 | |
Global init order: f6 | |
Poisoned by user: f7 | |
Container overflow: fc | |
Array cookie: ac | |
Intra object redzone: bb | |
ASan internal: fe | |
Left alloca redzone: ca | |
Right alloca redzone: cb | |
==2248266==ABORTING |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment