Created
January 13, 2025 09:45
-
-
Save pashu123/82d8e5dc7c8f87b82e0b76ce3bcda5c7 to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- // | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
module { | |
func.func @attention(%arg0: tensor<20x4096x64xf16>, %arg1: tensor<20x4096x64xf16>, %arg2: tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = tensor.empty() : tensor<20x4096x64xf16> | |
%1 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%arg0, %arg1, %arg2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%0 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
return %1 : tensor<20x4096x64xf16> | |
} | |
} | |
// -----// IR Dump After IREEImportPublicPass (iree-import-public) //----- // | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
module { | |
util.func public @attention(%arg0: tensor<20x4096x64xf16>, %arg1: tensor<20x4096x64xf16>, %arg2: tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = tensor.empty() : tensor<20x4096x64xf16> | |
%1 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%arg0, %arg1, %arg2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%0 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
util.return %1 : tensor<20x4096x64xf16> | |
} | |
} | |
// -----// IR Dump After ImportMLProgramPass (iree-import-ml-program) //----- // | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
module { | |
util.func public @attention(%arg0: tensor<20x4096x64xf16>, %arg1: tensor<20x4096x64xf16>, %arg2: tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = tensor.empty() : tensor<20x4096x64xf16> | |
%1 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%arg0, %arg1, %arg2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%0 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
util.return %1 : tensor<20x4096x64xf16> | |
} | |
} | |
// -----// IR Dump After SanitizeModuleNamesPass (iree-sanitize-module-names) //----- // | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
module { | |
util.func public @attention(%arg0: tensor<20x4096x64xf16>, %arg1: tensor<20x4096x64xf16>, %arg2: tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = tensor.empty() : tensor<20x4096x64xf16> | |
%1 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%arg0, %arg1, %arg2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%0 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
util.return %1 : tensor<20x4096x64xf16> | |
} | |
} | |
// -----// IR Dump After ConvertMeshToFlowPass (iree-convert-mesh-to-flow) //----- // | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
module { | |
util.func public @attention(%arg0: tensor<20x4096x64xf16>, %arg1: tensor<20x4096x64xf16>, %arg2: tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = tensor.empty() : tensor<20x4096x64xf16> | |
%1 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%arg0, %arg1, %arg2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%0 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
util.return %1 : tensor<20x4096x64xf16> | |
} | |
} | |
// -----// IR Dump After DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- // | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
module { | |
util.func public @attention(%arg0: tensor<20x4096x64xf16>, %arg1: tensor<20x4096x64xf16>, %arg2: tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = tensor.empty() : tensor<20x4096x64xf16> | |
%1 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%arg0, %arg1, %arg2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%0 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
util.return %1 : tensor<20x4096x64xf16> | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- // | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
module { | |
util.func public @attention(%arg0: tensor<20x4096x64xf16>, %arg1: tensor<20x4096x64xf16>, %arg2: tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = tensor.empty() : tensor<20x4096x64xf16> | |
%1 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%arg0, %arg1, %arg2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%0 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
util.return %1 : tensor<20x4096x64xf16> | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- // | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
module { | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = util.call @_attention(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
util.func private @_attention(%arg0: tensor<20x4096x64xf16>, %arg1: tensor<20x4096x64xf16>, %arg2: tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = tensor.empty() : tensor<20x4096x64xf16> | |
%1 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%arg0, %arg1, %arg2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%0 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
util.return %1 : tensor<20x4096x64xf16> | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_attention(%arg0: tensor<20x4096x64xf16>, %arg1: tensor<20x4096x64xf16>, %arg2: tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = tensor.empty() : tensor<20x4096x64xf16> | |
%1 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%arg0, %arg1, %arg2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%0 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
util.return %1 : tensor<20x4096x64xf16> | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = util.call @_attention(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After Inliner (inline) //----- // | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
module { | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
module { | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {hal.device.targets = [#device_target_local]} { | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After EraseUnusedLinalgOperandsPass (iree-global-opt-erase-unused-linalg-operands) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After DecomposeConcatPass (iree-global-opt-decompose-concat) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After PropagateLinalgTransposePass (iree-global-opt-propagate-linalg-transpose) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%transposed = linalg.transpose ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) permutation = [0, 2, 1] | |
%5 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %transposed, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%6 = hal.tensor.export %5 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %6 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%transposed = linalg.transpose ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) permutation = [0, 2, 1] | |
%5 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %transposed, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%6 = hal.tensor.export %5 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %6 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%transposed = linalg.transpose ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) permutation = [0, 2, 1] | |
%5 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %transposed, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%6 = hal.tensor.export %5 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %6 : !hal.buffer_view | |
} | |
// -----// IR Dump After SetEncodingPass (iree-dispatch-creation-set-encoding) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%transposed = linalg.transpose ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) permutation = [0, 2, 1] | |
%5 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %transposed, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%6 = hal.tensor.export %5 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %6 : !hal.buffer_view | |
} | |
// -----// IR Dump After MaterializeHostEncodingPass (iree-codegen-materialize-host-encoding) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%transposed = linalg.transpose ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) permutation = [0, 2, 1] | |
%5 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %transposed, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%6 = hal.tensor.export %5 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %6 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After MaterializeHomogeneousEncodingsPass (iree-global-opt-materialize-homogeneous-encodings) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%transposed = linalg.transpose ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) permutation = [0, 2, 1] | |
%5 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %transposed, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%6 = hal.tensor.export %5 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %6 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%transposed = linalg.transpose ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) permutation = [0, 2, 1] | |
%5 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %transposed, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%6 = hal.tensor.export %5 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %6 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%transposed = linalg.transpose ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) permutation = [0, 2, 1] | |
%5 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %transposed, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%6 = hal.tensor.export %5 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %6 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After SimplifyPackUnpackPass (iree-global-opt-simplify-pack-unpack) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%transposed = linalg.transpose ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) permutation = [0, 2, 1] | |
%5 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %transposed, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%6 = hal.tensor.export %5 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %6 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After DataLayoutPropagationPass (iree-global-opt-data-layout-propagation) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%transposed = linalg.transpose ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) permutation = [0, 2, 1] | |
%5 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %transposed, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%6 = hal.tensor.export %5 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %6 : !hal.buffer_view | |
} | |
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2) -> (d0, d2, d1)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map6 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [#map2, #map3, #map4, #map5, #map6]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2) -> (d0, d2, d1)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map6 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [#map2, #map3, #map4, #map5, #map6]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2) -> (d0, d2, d1)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map6 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [#map2, #map3, #map4, #map5, #map6]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After JitGlobalsPass (iree-consteval-jit-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2) -> (d0, d2, d1)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map6 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [#map2, #map3, #map4, #map5, #map6]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2) -> (d0, d2, d1)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map6 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [#map2, #map3, #map4, #map5, #map6]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2) -> (d0, d2, d1)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map6 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [#map2, #map3, #map4, #map5, #map6]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2) -> (d0, d2, d1)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map6 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [#map2, #map3, #map4, #map5, #map6]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2) -> (d0, d2, d1)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map6 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [#map2, #map3, #map4, #map5, #map6]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2) -> (d0, d2, d1)> | |
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map6 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [#map2, #map3, #map4, #map5, #map6]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2, d1)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2, d1)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = tensor.empty() : tensor<20x64x4096xf16> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2, d1)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<20x4096x64xf16>) outs(%4 : tensor<20x64x4096xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<20x64x4096xf16> | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %5, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x64x4096xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%7 = hal.tensor.export %6 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = flow.dispatch.region -> (tensor<20x4096x64xf16>) { | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.return %6 : tensor<20x4096x64xf16> | |
} | |
%5 = hal.tensor.export %4 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch.region -> (tensor<20x4096x64xf16>) { | |
%5 = tensor.empty() : tensor<20x4096x64xf16> | |
%cst = arith.constant 1.250000e-01 : f16 | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%5 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.return %6 : tensor<20x4096x64xf16> | |
} | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch.region -> (tensor<20x4096x64xf16>) { | |
%5 = tensor.empty() : tensor<20x4096x64xf16> | |
%cst = arith.constant 1.250000e-01 : f16 | |
%6 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%5 : tensor<20x4096x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.return %6 : tensor<20x4096x64xf16> | |
} | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch.workgroups(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg4: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%8 = tensor.empty() : tensor<20x4096x64xf16> | |
%cst = arith.constant 1.250000e-01 : f16 | |
%9 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%5, %6, %7, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%8 : tensor<20x4096x64xf16>) { | |
^bb0(%arg7: f32): | |
iree_linalg_ext.yield %arg7 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %9, %arg6, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
flow.return | |
} | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch.workgroups(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg4: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%8 = tensor.empty() : tensor<20x4096x64xf16> | |
%9 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%5, %6, %7, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%8 : tensor<20x4096x64xf16>) { | |
^bb0(%arg7: f32): | |
iree_linalg_ext.yield %arg7 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %9, %arg6, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
flow.return | |
} | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch.workgroups(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg4: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%8 = tensor.empty() : tensor<20x4096x64xf16> | |
%9 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%5, %6, %7, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%8 : tensor<20x4096x64xf16>) { | |
^bb0(%arg7: f32): | |
iree_linalg_ext.yield %arg7 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %9, %arg6, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
flow.return | |
} | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch.workgroups(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg4: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%8 = tensor.empty() : tensor<20x4096x64xf16> | |
%9 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%5, %6, %7, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%8 : tensor<20x4096x64xf16>) { | |
^bb0(%arg7: f32): | |
iree_linalg_ext.yield %arg7 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %9, %arg6, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
flow.return | |
} | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch.workgroups(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg4: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%8 = tensor.empty() : tensor<20x4096x64xf16> | |
%9 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%5, %6, %7, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%8 : tensor<20x4096x64xf16>) { | |
^bb0(%arg7: f32): | |
iree_linalg_ext.yield %arg7 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %9, %arg6, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After VerifyInputLegalityPass (iree-verify-input-legality) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch.workgroups(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg4: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%8 = tensor.empty() : tensor<20x4096x64xf16> | |
%9 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%5, %6, %7, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%8 : tensor<20x4096x64xf16>) { | |
^bb0(%arg7: f32): | |
iree_linalg_ext.yield %arg7 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %9, %arg6, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CaptureDynamicDimsPass (iree-flow-capture-dynamic-dims) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch.workgroups(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg4: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%8 = tensor.empty() : tensor<20x4096x64xf16> | |
%9 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%5, %6, %7, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%8 : tensor<20x4096x64xf16>) { | |
^bb0(%arg7: f32): | |
iree_linalg_ext.yield %arg7 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %9, %arg6, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch.workgroups(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg4: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%8 = tensor.empty() : tensor<20x4096x64xf16> | |
%9 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%5, %6, %7, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%8 : tensor<20x4096x64xf16>) { | |
^bb0(%arg7: f32): | |
iree_linalg_ext.yield %arg7 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %9, %arg6, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch.workgroups(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg4: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%8 = tensor.empty() : tensor<20x4096x64xf16> | |
%9 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%5, %6, %7, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%8 : tensor<20x4096x64xf16>) { | |
^bb0(%arg7: f32): | |
iree_linalg_ext.yield %arg7 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %9, %arg6, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After InitializeEmptyTensorsPass (iree-flow-initialize-empty-tensors) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch.workgroups(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg4: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%8 = tensor.empty() : tensor<20x4096x64xf16> | |
%9 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%5, %6, %7, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%8 : tensor<20x4096x64xf16>) { | |
^bb0(%arg7: f32): | |
iree_linalg_ext.yield %arg7 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %9, %arg6, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After OutlineDispatchExternsPass (iree-flow-outline-dispatch-externs) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch.workgroups(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg4: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%8 = tensor.empty() : tensor<20x4096x64xf16> | |
%9 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%5, %6, %7, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%8 : tensor<20x4096x64xf16>) { | |
^bb0(%arg7: f32): | |
iree_linalg_ext.yield %arg7 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %9, %arg6, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After OutlineDispatchRegionsPass (iree-flow-outline-dispatch-regions) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
flow.executable private @attention_dispatch_0 { | |
flow.executable.export public @attention_dispatch_0 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg2: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %4, %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After AnnotateDispatchesPass (iree-flow-annotate-dispatches) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
flow.executable private @attention_dispatch_0 { | |
flow.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg2: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %4, %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After StripDebugOpsPass (iree-util-strip-debug-ops) //----- // | |
flow.executable private @attention_dispatch_0 { | |
flow.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg2: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %4, %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After DeduplicateExecutablesPass (iree-flow-deduplicate-executables) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
flow.executable private @attention_dispatch_0 { | |
flow.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg2: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %4, %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CleanupTensorShapesPass (iree-flow-cleanup-tensor-shapes) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After OutlineConstantsPass (iree-flow-outline-constants) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
flow.executable private @attention_dispatch_0 { | |
flow.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg2: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %4, %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
flow.executable private @attention_dispatch_0 { | |
flow.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg2: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %4, %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
flow.executable private @attention_dispatch_0 { | |
flow.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg2: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %4, %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
flow.executable private @attention_dispatch_0 { | |
flow.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg2: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %4, %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
flow.executable private @attention_dispatch_0 { | |
flow.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg2: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %4, %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
flow.executable private @attention_dispatch_0 { | |
flow.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg2: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %4, %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyInputPass (iree-stream-verify-input) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
flow.executable private @attention_dispatch_0 { | |
flow.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg2: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %4, %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
// Whole-module snapshot at the flow level.
// The aliases below name: the "llvm-cpu" / "embedded-elf-x86_64" executable
// target, the five affine maps referenced by the attention op's indexing_maps,
// and the "local" device target that wraps the executable target.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
// The module's default stream affinity points at the @__device_0 global,
// which is initialized to #device_target_local.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Executable holding the single attention dispatch. Its export returns the
// three workgroup counts produced by flow.dispatch.workgroup_count_from_slice.
flow.executable private @attention_dispatch_0 { | |
flow.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: %arg0..%arg2 are read-only dispatch tensors and %arg3 is the
// write-only result. Each input is loaded in full (offsets 0, sizes
// 20x4096x64, unit strides), fed to iree_linalg_ext.attention with a
// tensor.empty destination, and the op result is stored in full to %arg3.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg2: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
// f16 scalar 0.125 passed as the fourth attention input (the one indexed by
// the zero-result #map3). Presumably the softmax scale -- confirm against the
// iree_linalg_ext.attention op definition.
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
// The op's region takes one f32 block argument and yields it unchanged.
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %4, %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Host entry point: imports the three buffer views as tensors, issues the
// flow.dispatch to the export above, and exports the result as "output0".
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
// Whole-module snapshot at the flow level.
// The aliases below name: the "llvm-cpu" / "embedded-elf-x86_64" executable
// target, the five affine maps referenced by the attention op's indexing_maps,
// and the "local" device target that wraps the executable target.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
// The module's default stream affinity points at the @__device_0 global,
// which is initialized to #device_target_local.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Executable holding the single attention dispatch. Its export returns the
// three workgroup counts produced by flow.dispatch.workgroup_count_from_slice.
flow.executable private @attention_dispatch_0 { | |
flow.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: %arg0..%arg2 are read-only dispatch tensors and %arg3 is the
// write-only result. Each input is loaded in full (offsets 0, sizes
// 20x4096x64, unit strides), fed to iree_linalg_ext.attention with a
// tensor.empty destination, and the op result is stored in full to %arg3.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg2: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
// f16 scalar 0.125 passed as the fourth attention input (the one indexed by
// the zero-result #map3). Presumably the softmax scale -- confirm against the
// iree_linalg_ext.attention op definition.
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
// The op's region takes one f32 block argument and yields it unchanged.
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %4, %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Host entry point: imports the three buffer views as tensors, issues the
// flow.dispatch to the export above, and exports the result as "output0".
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
// Whole-module snapshot at the flow level.
// The aliases below name: the "llvm-cpu" / "embedded-elf-x86_64" executable
// target, the five affine maps referenced by the attention op's indexing_maps,
// and the "local" device target that wraps the executable target.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
// The module's default stream affinity points at the @__device_0 global,
// which is initialized to #device_target_local.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Executable holding the single attention dispatch. Its export returns the
// three workgroup counts produced by flow.dispatch.workgroup_count_from_slice.
flow.executable private @attention_dispatch_0 { | |
flow.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: %arg0..%arg2 are read-only dispatch tensors and %arg3 is the
// write-only result. Each input is loaded in full (offsets 0, sizes
// 20x4096x64, unit strides), fed to iree_linalg_ext.attention with a
// tensor.empty destination, and the op result is stored in full to %arg3.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg2: !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>) { | |
// f16 scalar 0.125 passed as the fourth attention input (the one indexed by
// the zero-result #map3). Presumably the softmax scale -- confirm against the
// iree_linalg_ext.attention op definition.
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%3 = tensor.empty() : tensor<20x4096x64xf16> | |
// The op's region takes one f32 block argument and yields it unchanged.
%4 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%0, %1, %2, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%3 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %4, %arg3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Host entry point: imports the three buffer views as tensors, issues the
// flow.dispatch to the export above, and exports the result as "output0".
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<20x4096x64xf16> | |
%3 = flow.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0, %1, %2) : (tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> | |
%4 = hal.tensor.export %3 "output0" : tensor<20x4096x64xf16> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ConvertToStreamPass (iree-stream-conversion) //----- // | |
// Whole-module snapshot at the stream level: the executable and its export are
// stream.executable ops, and the host function uses stream.* resource ops.
// The aliases below name: the "llvm-cpu" / "embedded-elf-x86_64" executable
// target, the five affine maps referenced by the attention op's indexing_maps,
// and the "local" device target that wraps the executable target.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
// The module's default stream affinity points at the @__device_0 global,
// which is initialized to #device_target_local.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Executable holding the single attention dispatch. The export still obtains
// its workgroup counts from flow.dispatch.workgroup_count_from_slice but
// returns them with stream.return.
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: all four arguments are opaque !stream.binding values. Each is
// turned into a !flow.dispatch.tensor with stream.binding.subspan at offset
// %c0 (three read-only inputs, one write-only output) before the same
// load / attention / store sequence runs.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// f16 scalar 0.125 passed as the fourth attention input (the one indexed by
// the zero-result #map3). Presumably the softmax scale -- confirm against the
// iree_linalg_ext.attention op definition.
%cst = arith.constant 1.250000e-01 : f16 | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// The op's region takes one f32 block argument and yields it unchanged.
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Host entry point. For each of the three inputs the same sequence appears:
//   1. hal.buffer_view.assert checks shape [20, 4096, 64], f16 element type and
//      dense_row_major encoding, tagged with the input's name;
//   2. stream.tensor.sizeof yields the size used to annotate the resource;
//   3. stream.tensor.import wraps the buffer view as an external resource;
//   4. stream.async.transfer (from and to @__device_0) produces a
//      !stream.resource<*> value.
// Each input gets its own copies of the element-type, encoding and shape
// constants (suffixed _0.._9).
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
%c20 = arith.constant 20 : index | |
%c4096 = arith.constant 4096 : index | |
%c64 = arith.constant 64 : index | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
%element_type_f16_0 = hal.element_type<f16> : i32 | |
%dense_row_major_1 = hal.encoding_type<dense_row_major> : i32 | |
%c20_2 = arith.constant 20 : index | |
%c4096_3 = arith.constant 4096 : index | |
%c64_4 = arith.constant 64 : index | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20_2, %c4096_3, %c64_4]) type(%element_type_f16_0) encoding(%dense_row_major_1) | |
%3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%3} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%3} | |
%element_type_f16_5 = hal.element_type<f16> : i32 | |
%dense_row_major_6 = hal.encoding_type<dense_row_major> : i32 | |
%c20_7 = arith.constant 20 : index | |
%c4096_8 = arith.constant 4096 : index | |
%c64_9 = arith.constant 64 : index | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20_7, %c4096_8, %c64_9]) type(%element_type_f16_5) encoding(%dense_row_major_6) | |
%6 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%7 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%6} | |
%8 = stream.async.transfer %7 : !stream.resource<external>{%6} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%6} | |
// Dispatch: each operand is passed with the range [%c0 to size for size];
// %9 is the size attached to the result resource. The result is then
// transferred to an external resource and exported as a buffer view.
%c0 = arith.constant 0 : index | |
%9 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%10 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3], %8[%c0 to %6 for %6]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%9} | |
%11 = stream.async.transfer %10 : !stream.resource<*>{%9} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%9} | |
%12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<20x4096x64xf16> in !stream.resource<external>{%9} -> !hal.buffer_view | |
util.return %12 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyLoweringToTensorsPass (iree-stream-verify-lowering-to-tensors) //----- // | |
// Whole-module snapshot at the stream level: the executable and its export are
// stream.executable ops, and the host function uses stream.* resource ops.
// The aliases below name: the "llvm-cpu" / "embedded-elf-x86_64" executable
// target, the five affine maps referenced by the attention op's indexing_maps,
// and the "local" device target that wraps the executable target.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
// The module's default stream affinity points at the @__device_0 global,
// which is initialized to #device_target_local.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Executable holding the single attention dispatch. The export still obtains
// its workgroup counts from flow.dispatch.workgroup_count_from_slice but
// returns them with stream.return.
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: all four arguments are opaque !stream.binding values. Each is
// turned into a !flow.dispatch.tensor with stream.binding.subspan at offset
// %c0 (three read-only inputs, one write-only output) before the same
// load / attention / store sequence runs.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// f16 scalar 0.125 passed as the fourth attention input (the one indexed by
// the zero-result #map3). Presumably the softmax scale -- confirm against the
// iree_linalg_ext.attention op definition.
%cst = arith.constant 1.250000e-01 : f16 | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// The op's region takes one f32 block argument and yields it unchanged.
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Host entry point. For each of the three inputs the same sequence appears:
//   1. hal.buffer_view.assert checks shape [20, 4096, 64], f16 element type and
//      dense_row_major encoding, tagged with the input's name;
//   2. stream.tensor.sizeof yields the size used to annotate the resource;
//   3. stream.tensor.import wraps the buffer view as an external resource;
//   4. stream.async.transfer (from and to @__device_0) produces a
//      !stream.resource<*> value.
// Each input gets its own copies of the element-type, encoding and shape
// constants (suffixed _0.._9).
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
%c20 = arith.constant 20 : index | |
%c4096 = arith.constant 4096 : index | |
%c64 = arith.constant 64 : index | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
%element_type_f16_0 = hal.element_type<f16> : i32 | |
%dense_row_major_1 = hal.encoding_type<dense_row_major> : i32 | |
%c20_2 = arith.constant 20 : index | |
%c4096_3 = arith.constant 4096 : index | |
%c64_4 = arith.constant 64 : index | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20_2, %c4096_3, %c64_4]) type(%element_type_f16_0) encoding(%dense_row_major_1) | |
%3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%3} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%3} | |
%element_type_f16_5 = hal.element_type<f16> : i32 | |
%dense_row_major_6 = hal.encoding_type<dense_row_major> : i32 | |
%c20_7 = arith.constant 20 : index | |
%c4096_8 = arith.constant 4096 : index | |
%c64_9 = arith.constant 64 : index | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20_7, %c4096_8, %c64_9]) type(%element_type_f16_5) encoding(%dense_row_major_6) | |
%6 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%7 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%6} | |
%8 = stream.async.transfer %7 : !stream.resource<external>{%6} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%6} | |
// Dispatch: each operand is passed with the range [%c0 to size for size];
// %9 is the size attached to the result resource. The result is then
// transferred to an external resource and exported as a buffer view.
%c0 = arith.constant 0 : index | |
%9 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%10 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3], %8[%c0 to %6 for %6]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%9} | |
%11 = stream.async.transfer %10 : !stream.resource<*>{%9} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%9} | |
%12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<20x4096x64xf16> in !stream.resource<external>{%9} -> !hal.buffer_view | |
util.return %12 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Dispatch body (function-scoped dump).
// Both constants come first, then the four !stream.binding arguments are turned
// into dispatch tensors with stream.binding.subspan at offset %c0: three
// read-only inputs and one write-only output. The inputs are loaded in full,
// passed to iree_linalg_ext.attention with a tensor.empty destination, and the
// result is stored in full to the output binding.
// The attention op's indexing maps are written inline as affine_map literals
// in this dump instead of through #map aliases.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
// f16 scalar 0.125 passed as the fourth attention input (the one whose
// indexing map has no results). Presumably the softmax scale -- confirm
// against the iree_linalg_ext.attention op definition.
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// The op's region takes one f32 block argument and yields it unchanged.
%8 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Host entry point at the stream level (function-scoped dump).
// The index constants sit at the top of the body and a single
// %c20 / %c4096 / %c64 set is used by all three buffer-view asserts.
// hal.element_type and hal.encoding_type are still materialized once per input
// (three pairs in total).
// Per input: assert shape/type/encoding, compute the tensor size, import the
// buffer view as an external resource, and transfer it to a
// !stream.resource<*>. Then dispatch, transfer the result to an external
// resource, and export it as a buffer view.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
%element_type_f16_0 = hal.element_type<f16> : i32 | |
%dense_row_major_1 = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16_0) encoding(%dense_row_major_1) | |
%3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%3} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%3} | |
%element_type_f16_2 = hal.element_type<f16> : i32 | |
%dense_row_major_3 = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16_2) encoding(%dense_row_major_3) | |
%6 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%7 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%6} | |
%8 = stream.async.transfer %7 : !stream.resource<external>{%6} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%6} | |
// %9 is the size attached to the dispatch result resource; each operand is
// passed with the range [%c0 to size for size].
%9 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%10 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3], %8[%c0 to %6 for %6]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%9} | |
%11 = stream.async.transfer %10 : !stream.resource<*>{%9} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%9} | |
%12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<20x4096x64xf16> in !stream.resource<external>{%9} -> !hal.buffer_view | |
util.return %12 : !hal.buffer_view | |
} | |
// -----// IR Dump After Inliner (inline) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
%element_type_f16_0 = hal.element_type<f16> : i32 | |
%dense_row_major_1 = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16_0) encoding(%dense_row_major_1) | |
%3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%3} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%3} | |
%element_type_f16_2 = hal.element_type<f16> : i32 | |
%dense_row_major_3 = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16_2) encoding(%dense_row_major_3) | |
%6 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%7 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%6} | |
%8 = stream.async.transfer %7 : !stream.resource<external>{%6} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%6} | |
%9 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%10 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3], %8[%c0 to %6 for %6]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%9} | |
%11 = stream.async.transfer %10 : !stream.resource<*>{%9} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%9} | |
%12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<20x4096x64xf16> in !stream.resource<external>{%9} -> !hal.buffer_view | |
util.return %12 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
%element_type_f16_0 = hal.element_type<f16> : i32 | |
%dense_row_major_1 = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16_0) encoding(%dense_row_major_1) | |
%3 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%3} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%3} | |
%element_type_f16_2 = hal.element_type<f16> : i32 | |
%dense_row_major_3 = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16_2) encoding(%dense_row_major_3) | |
%6 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%7 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%6} | |
%8 = stream.async.transfer %7 : !stream.resource<external>{%6} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%6} | |
%9 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%10 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%2[%c0 to %0 for %0], %5[%c0 to %3 for %3], %8[%c0 to %6 for %6]) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%9} | |
%11 = stream.async.transfer %10 : !stream.resource<*>{%9} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%9} | |
%12 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %11 : tensor<20x4096x64xf16> in !stream.resource<external>{%9} -> !hal.buffer_view | |
util.return %12 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%4 = stream.async.transfer %3 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%5 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%6 = stream.async.transfer %5 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
%7 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%2[%c0 to %0 for %0], %4[%c0 to %0 for %0], %6[%c0 to %0 for %0]) : (!stream.resource<*>{%0}, !stream.resource<*>{%0}, !stream.resource<*>{%0}) -> !stream.resource<*>{%0} | |
%8 = stream.async.transfer %7 : !stream.resource<*>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%0} | |
%9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<20x4096x64xf16> in !stream.resource<external>{%0} -> !hal.buffer_view | |
util.return %9 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%4 = stream.async.transfer %3 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%5 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%6 = stream.async.transfer %5 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
%7 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%2[%c0 to %0 for %0], %4[%c0 to %0 for %0], %6[%c0 to %0 for %0]) : (!stream.resource<*>{%0}, !stream.resource<*>{%0}, !stream.resource<*>{%0}) -> !stream.resource<*>{%0} | |
%8 = stream.async.transfer %7 : !stream.resource<*>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%0} | |
%9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<20x4096x64xf16> in !stream.resource<external>{%0} -> !hal.buffer_view | |
util.return %9 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%4 = stream.async.transfer %3 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%5 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%6 = stream.async.transfer %5 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
%7 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%2[%c0 to %0 for %0], %4[%c0 to %0 for %0], %6[%c0 to %0 for %0]) : (!stream.resource<*>{%0}, !stream.resource<*>{%0}, !stream.resource<*>{%0}) -> !stream.resource<*>{%0} | |
%8 = stream.async.transfer %7 : !stream.resource<*>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%0} | |
%9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<20x4096x64xf16> in !stream.resource<external>{%0} -> !hal.buffer_view | |
util.return %9 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%4 = stream.async.transfer %3 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%5 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%6 = stream.async.transfer %5 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
%7 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%2[%c0 to %0 for %0], %4[%c0 to %0 for %0], %6[%c0 to %0 for %0]) : (!stream.resource<*>{%0}, !stream.resource<*>{%0}, !stream.resource<*>{%0}) -> !stream.resource<*>{%0} | |
%8 = stream.async.transfer %7 : !stream.resource<*>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%0} | |
%9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<20x4096x64xf16> in !stream.resource<external>{%0} -> !hal.buffer_view | |
util.return %9 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%4 = stream.async.transfer %3 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%5 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%6 = stream.async.transfer %5 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
%7 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%2[%c0 to %0 for %0], %4[%c0 to %0 for %0], %6[%c0 to %0 for %0]) : (!stream.resource<*>{%0}, !stream.resource<*>{%0}, !stream.resource<*>{%0}) -> !stream.resource<*>{%0} | |
%8 = stream.async.transfer %7 : !stream.resource<*>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%0} | |
%9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<20x4096x64xf16> in !stream.resource<external>{%0} -> !hal.buffer_view | |
util.return %9 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%4 = stream.async.transfer %3 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%5 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%6 = stream.async.transfer %5 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
%7 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%2[%c0 to %0 for %0], %4[%c0 to %0 for %0], %6[%c0 to %0 for %0]) : (!stream.resource<*>{%0}, !stream.resource<*>{%0}, !stream.resource<*>{%0}) -> !stream.resource<*>{%0} | |
%8 = stream.async.transfer %7 : !stream.resource<*>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%0} | |
%9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<20x4096x64xf16> in !stream.resource<external>{%0} -> !hal.buffer_view | |
util.return %9 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%4 = stream.async.transfer %3 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%5 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%6 = stream.async.transfer %5 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
%7 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%2[%c0 to %0 for %0], %4[%c0 to %0 for %0], %6[%c0 to %0 for %0]) : (!stream.resource<*>{%0}, !stream.resource<*>{%0}, !stream.resource<*>{%0}) -> !stream.resource<*>{%0} | |
%8 = stream.async.transfer %7 : !stream.resource<*>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%0} | |
%9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<20x4096x64xf16> in !stream.resource<external>{%0} -> !hal.buffer_view | |
util.return %9 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CombineInitializersPass (iree-util-combine-initializers) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<20x4096x64xf16> : index | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%4 = stream.async.transfer %3 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%5 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%0} | |
%6 = stream.async.transfer %5 : !stream.resource<external>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0} | |
%7 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%2[%c0 to %0 for %0], %4[%c0 to %0 for %0], %6[%c0 to %0 for %0]) : (!stream.resource<*>{%0}, !stream.resource<*>{%0}, !stream.resource<*>{%0}) -> !stream.resource<*>{%0} | |
%8 = stream.async.transfer %7 : !stream.resource<*>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%0} | |
%9 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %8 : tensor<20x4096x64xf16> in !stream.resource<external>{%0} -> !hal.buffer_view | |
util.return %9 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- // | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// -----// IR Dump After EncodeHostTensorsPass (iree-stream-encode-host-tensors) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%1[%c0 to %c10485760 for %c10485760], %3[%c0 to %c10485760 for %c10485760], %5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}) -> !stream.resource<*>{%c10485760} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c10485760} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%1[%c0 to %c10485760 for %c10485760], %3[%c0 to %c10485760 for %c10485760], %5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}) -> !stream.resource<*>{%c10485760} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c10485760} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%1[%c0 to %c10485760 for %c10485760], %3[%c0 to %c10485760 for %c10485760], %5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}) -> !stream.resource<*>{%c10485760} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c10485760} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%1[%c0 to %c10485760 for %c10485760], %3[%c0 to %c10485760 for %c10485760], %5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}) -> !stream.resource<*>{%c10485760} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c10485760} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%1[%c0 to %c10485760 for %c10485760], %3[%c0 to %c10485760 for %c10485760], %5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}) -> !stream.resource<*>{%c10485760} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c10485760} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%1[%c0 to %c10485760 for %c10485760], %3[%c0 to %c10485760 for %c10485760], %5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}) -> !stream.resource<*>{%c10485760} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c10485760} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
// Attribute aliases used by the module below.
// HAL executable target: "llvm-cpu" backend, "embedded-elf-x86_64" format, generic CPU, native_vector_size = 16.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Indexing maps over (d0..d4), passed in this order to iree_linalg_ext.attention:
// three tensor inputs (#map, #map1, #map2), the scalar input (#map3 -> ()), and the output (#map4).
// NOTE(review): presumably query / key / value / scale / output; the dump does not name the operands.
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
// "local" HAL device target wrapping the executable target above.
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
// Top-level module; declares @__device_0 as the default stream affinity.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from #device_target_local.
util.global private @__device_0 = #device_target_local | |
// Executable holding the single attention dispatch.
stream.executable private @attention_dispatch_0 { | |
// Export entry; its region returns the three workgroup counts produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: four bindings, the first three mapped read-only and the fourth write-only (see the subspans below).
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
// Scalar operand of the attention op; 0.125 equals 1/sqrt(64).
// NOTE(review): presumably the softmax scale for the size-64 dimension - confirm.
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
// View each binding at offset %c0 as a 20x4096x64xf16 dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Load all three inputs in full (zero offsets, full sizes, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
// Output operand for the attention op.
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// Attention over the three loaded tensors and %cst; the region takes one f32 block argument and yields it unchanged.
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
// Store the complete result into the write-only binding.
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Public entry point (marked iree.abi.stub); the reflection string declares a sync function over three 20x4096x64xf16 inputs and one such output.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// 10485760 = 20 * 4096 * 64 elements * 2 bytes per f16; used as the size of every stream resource below.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// Per input: assert shape [20, 4096, 64], f16 element type and dense row-major encoding, import the buffer view
// as an external stream resource, then transfer it to a !stream.resource<*> (source and target affinity are both @__device_0).
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
// Dispatch the exported entry point with the three transferred inputs, each passed as the range %c0 to %c10485760; yields one result resource of the same size.
// NOTE(review): the dispatch body declares four bindings - presumably the result is bound to the fourth (write-only) one.
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%1[%c0 to %c10485760 for %c10485760], %3[%c0 to %c10485760 for %c10485760], %5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}) -> !stream.resource<*>{%c10485760} | |
// Transfer the result to an external resource, export it as a buffer view and return it.
%7 = stream.async.transfer %6 : !stream.resource<*>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c10485760} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
// NOTE(review): the IR in this dump appears unchanged from the preceding dump in this log, i.e. the pass looks like a no-op here.
// Attribute aliases used by the module below.
// HAL executable target: "llvm-cpu" backend, "embedded-elf-x86_64" format, generic CPU, native_vector_size = 16.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Indexing maps over (d0..d4), passed in this order to iree_linalg_ext.attention:
// three tensor inputs (#map, #map1, #map2), the scalar input (#map3 -> ()), and the output (#map4).
// NOTE(review): presumably query / key / value / scale / output; the dump does not name the operands.
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
// "local" HAL device target wrapping the executable target above.
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
// Top-level module; declares @__device_0 as the default stream affinity.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from #device_target_local.
util.global private @__device_0 = #device_target_local | |
// Executable holding the single attention dispatch.
stream.executable private @attention_dispatch_0 { | |
// Export entry; its region returns the three workgroup counts produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: four bindings, the first three mapped read-only and the fourth write-only (see the subspans below).
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
// Scalar operand of the attention op; 0.125 equals 1/sqrt(64).
// NOTE(review): presumably the softmax scale for the size-64 dimension - confirm.
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
// View each binding at offset %c0 as a 20x4096x64xf16 dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Load all three inputs in full (zero offsets, full sizes, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
// Output operand for the attention op.
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// Attention over the three loaded tensors and %cst; the region takes one f32 block argument and yields it unchanged.
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
// Store the complete result into the write-only binding.
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Public entry point (marked iree.abi.stub); the reflection string declares a sync function over three 20x4096x64xf16 inputs and one such output.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// 10485760 = 20 * 4096 * 64 elements * 2 bytes per f16; used as the size of every stream resource below.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// Per input: assert shape [20, 4096, 64], f16 element type and dense row-major encoding, import the buffer view
// as an external stream resource, then transfer it to a !stream.resource<*> (source and target affinity are both @__device_0).
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
// Dispatch the exported entry point with the three transferred inputs, each passed as the range %c0 to %c10485760; yields one result resource of the same size.
// NOTE(review): the dispatch body declares four bindings - presumably the result is bound to the fourth (write-only) one.
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%1[%c0 to %c10485760 for %c10485760], %3[%c0 to %c10485760 for %c10485760], %5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}) -> !stream.resource<*>{%c10485760} | |
// Transfer the result to an external resource, export it as a buffer view and return it.
%7 = stream.async.transfer %6 : !stream.resource<*>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c10485760} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
// NOTE(review): the IR in this dump appears unchanged from the preceding dump in this log, i.e. the pass looks like a no-op here.
// Attribute aliases used by the module below.
// HAL executable target: "llvm-cpu" backend, "embedded-elf-x86_64" format, generic CPU, native_vector_size = 16.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Indexing maps over (d0..d4), passed in this order to iree_linalg_ext.attention:
// three tensor inputs (#map, #map1, #map2), the scalar input (#map3 -> ()), and the output (#map4).
// NOTE(review): presumably query / key / value / scale / output; the dump does not name the operands.
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
// "local" HAL device target wrapping the executable target above.
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
// Top-level module; declares @__device_0 as the default stream affinity.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from #device_target_local.
util.global private @__device_0 = #device_target_local | |
// Executable holding the single attention dispatch.
stream.executable private @attention_dispatch_0 { | |
// Export entry; its region returns the three workgroup counts produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: four bindings, the first three mapped read-only and the fourth write-only (see the subspans below).
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
// Scalar operand of the attention op; 0.125 equals 1/sqrt(64).
// NOTE(review): presumably the softmax scale for the size-64 dimension - confirm.
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
// View each binding at offset %c0 as a 20x4096x64xf16 dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Load all three inputs in full (zero offsets, full sizes, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
// Output operand for the attention op.
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// Attention over the three loaded tensors and %cst; the region takes one f32 block argument and yields it unchanged.
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
// Store the complete result into the write-only binding.
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Public entry point (marked iree.abi.stub); the reflection string declares a sync function over three 20x4096x64xf16 inputs and one such output.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// 10485760 = 20 * 4096 * 64 elements * 2 bytes per f16; used as the size of every stream resource below.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// Per input: assert shape [20, 4096, 64], f16 element type and dense row-major encoding, import the buffer view
// as an external stream resource, then transfer it to a !stream.resource<*> (source and target affinity are both @__device_0).
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
// Dispatch the exported entry point with the three transferred inputs, each passed as the range %c0 to %c10485760; yields one result resource of the same size.
// NOTE(review): the dispatch body declares four bindings - presumably the result is bound to the fourth (write-only) one.
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%1[%c0 to %c10485760 for %c10485760], %3[%c0 to %c10485760 for %c10485760], %5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}) -> !stream.resource<*>{%c10485760} | |
// Transfer the result to an external resource, export it as a buffer view and return it.
%7 = stream.async.transfer %6 : !stream.resource<*>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c10485760} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyLoweringToAsyncResourcesPass (iree-stream-verify-lowering-to-async-resources) //----- // | |
// NOTE(review): the IR in this dump appears unchanged from the preceding dump in this log, i.e. the pass looks like a no-op here.
// Attribute aliases used by the module below.
// HAL executable target: "llvm-cpu" backend, "embedded-elf-x86_64" format, generic CPU, native_vector_size = 16.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Indexing maps over (d0..d4), passed in this order to iree_linalg_ext.attention:
// three tensor inputs (#map, #map1, #map2), the scalar input (#map3 -> ()), and the output (#map4).
// NOTE(review): presumably query / key / value / scale / output; the dump does not name the operands.
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
// "local" HAL device target wrapping the executable target above.
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
// Top-level module; declares @__device_0 as the default stream affinity.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from #device_target_local.
util.global private @__device_0 = #device_target_local | |
// Executable holding the single attention dispatch.
stream.executable private @attention_dispatch_0 { | |
// Export entry; its region returns the three workgroup counts produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: four bindings, the first three mapped read-only and the fourth write-only (see the subspans below).
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
// Scalar operand of the attention op; 0.125 equals 1/sqrt(64).
// NOTE(review): presumably the softmax scale for the size-64 dimension - confirm.
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
// View each binding at offset %c0 as a 20x4096x64xf16 dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Load all three inputs in full (zero offsets, full sizes, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
// Output operand for the attention op.
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// Attention over the three loaded tensors and %cst; the region takes one f32 block argument and yields it unchanged.
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
// Store the complete result into the write-only binding.
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Public entry point (marked iree.abi.stub); the reflection string declares a sync function over three 20x4096x64xf16 inputs and one such output.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// 10485760 = 20 * 4096 * 64 elements * 2 bytes per f16; used as the size of every stream resource below.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// Per input: assert shape [20, 4096, 64], f16 element type and dense row-major encoding, import the buffer view
// as an external stream resource, then transfer it to a !stream.resource<*> (source and target affinity are both @__device_0).
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
// Dispatch the exported entry point with the three transferred inputs, each passed as the range %c0 to %c10485760; yields one result resource of the same size.
// NOTE(review): the dispatch body declares four bindings - presumably the result is bound to the fourth (write-only) one.
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%1[%c0 to %c10485760 for %c10485760], %3[%c0 to %c10485760 for %c10485760], %5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}) -> !stream.resource<*>{%c10485760} | |
// Transfer the result to an external resource, export it as a buffer view and return it.
%7 = stream.async.transfer %6 : !stream.resource<*>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c10485760} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After MaterializeCopyOnWritePass (iree-stream-materialize-copy-on-write) //----- // | |
// Function-scoped dump: only the public entry point is printed here.
// Public entry point (marked iree.abi.stub); the reflection string declares a sync function over three 20x4096x64xf16 inputs and one such output.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// 10485760 = 20 * 4096 * 64 elements * 2 bytes per f16; used as the size of every stream resource below.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// Per input: assert shape [20, 4096, 64], f16 element type and dense row-major encoding, import the buffer view
// as an external stream resource, then transfer it to a !stream.resource<*> (source and target affinity are both @__device_0).
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
// Dispatch @attention_dispatch_0's exported entry point with the three transferred inputs, each passed as the range %c0 to %c10485760; yields one result resource of the same size.
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%1[%c0 to %c10485760 for %c10485760], %3[%c0 to %c10485760 for %c10485760], %5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}) -> !stream.resource<*>{%c10485760} | |
// Transfer the result to an external resource, export it as a buffer view and return it.
%7 = stream.async.transfer %6 : !stream.resource<*>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c10485760} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Function-scoped dump: only the public entry point is printed here.
// NOTE(review): the function text appears unchanged from the preceding dump in this log, i.e. canonicalization looks like a no-op here.
// Public entry point (marked iree.abi.stub); the reflection string declares a sync function over three 20x4096x64xf16 inputs and one such output.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// 10485760 = 20 * 4096 * 64 elements * 2 bytes per f16; used as the size of every stream resource below.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// Per input: assert shape [20, 4096, 64], f16 element type and dense row-major encoding, import the buffer view
// as an external stream resource, then transfer it to a !stream.resource<*> (source and target affinity are both @__device_0).
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
// Dispatch @attention_dispatch_0's exported entry point with the three transferred inputs, each passed as the range %c0 to %c10485760; yields one result resource of the same size.
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%1[%c0 to %c10485760 for %c10485760], %3[%c0 to %c10485760 for %c10485760], %5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}) -> !stream.resource<*>{%c10485760} | |
// Transfer the result to an external resource, export it as a buffer view and return it.
%7 = stream.async.transfer %6 : !stream.resource<*>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c10485760} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After ElideAsyncCopiesPass (iree-stream-elide-async-copies) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%1[%c0 to %c10485760 for %c10485760], %3[%c0 to %c10485760 for %c10485760], %5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}) -> !stream.resource<*>{%c10485760} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c10485760} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%1[%c0 to %c10485760 for %c10485760], %3[%c0 to %c10485760 for %c10485760], %5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}) -> !stream.resource<*>{%c10485760} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c10485760} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After EmplaceAllocationsPass (iree-stream-emplace-allocations) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%4 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c10485760} | |
%6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%1[%c0 to %c10485760 for %c10485760], %3[%c0 to %c10485760 for %c10485760], %5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}, !stream.resource<*>{%c10485760}) -> !stream.resource<*>{%c10485760} | |
%7 = stream.async.transfer %6 : !stream.resource<*>{%c10485760} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c10485760} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After RefineUsagePass (iree-stream-refine-usage) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0[%c0 to %c10485760 for %c10485760], %1[%c0 to %c10485760 for %c10485760], %2[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0[%c0 to %c10485760 for %c10485760], %1[%c0 to %c10485760 for %c10485760], %2[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0[%c0 to %c10485760 for %c10485760], %1[%c0 to %c10485760 for %c10485760], %2[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0[%c0 to %c10485760 for %c10485760], %1[%c0 to %c10485760 for %c10485760], %2[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0[%c0 to %c10485760 for %c10485760], %1[%c0 to %c10485760 for %c10485760], %2[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0[%c0 to %c10485760 for %c10485760], %1[%c0 to %c10485760 for %c10485760], %2[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0[%c0 to %c10485760 for %c10485760], %1[%c0 to %c10485760 for %c10485760], %2[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Executable @attention_dispatch_0: one public export plus the builtin.module that holds the function of the same name.
stream.executable private @attention_dispatch_0 { | |
// Export region: takes no workgroups() operands and returns the (x, y, z) counts produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body over four bindings: %arg0..%arg2 are viewed readonly and %arg3 writeonly by the subspans below.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
// f16 constant 0.125, passed as the scalar fourth input of the attention op.
// NOTE(review): presumably the 1/sqrt(64) attention scale — confirm against the model source.
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
// View each binding at offset 0 as a 20x4096x64xf16 dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Load the full extent (offsets 0, sizes [20, 4096, 64], unit strides) of each readonly tensor.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
// Empty tensor supplied as the outs operand of the attention op.
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// Attention over %4, %5, %6 and the scalar %cst using indexing maps #map..#map4; the region yields its f32 block argument unchanged.
// NOTE(review): operand order is presumably query, key, value, scale — confirm against the op definition.
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
// Store the full attention result into the writeonly binding.
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Public entry point @attention (marked iree.abi.stub): takes three !hal.buffer_view inputs and returns one !hal.buffer_view.
// Per the iree.abi.declaration string, all three inputs and the output are tensor<20x4096x64xf16>.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// Resource size used throughout: 10485760 = 20 * 4096 * 64 f16 elements at 2 bytes each.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
// Expected dimensions, element type and encoding checked by the buffer-view asserts below.
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// For each argument: assert shape [20, 4096, 64], f16 element type and dense_row_major encoding,
// then import it as an external stream resource on @__device_0.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Dispatch the @attention_dispatch_0 export with the range 0 to 10485760 (length 10485760) of each imported resource;
// the result is a new external resource of the same size.
%3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0[%c0 to %c10485760 for %c10485760], %1[%c0 to %c10485760 for %c10485760], %2[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
// Export the dispatch result as a tensor<20x4096x64xf16> buffer view and return it.
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Executable @attention_dispatch_0: one public export plus the builtin.module that holds the function of the same name.
stream.executable private @attention_dispatch_0 { | |
// Export region: takes no workgroups() operands and returns the (x, y, z) counts produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body over four bindings: %arg0..%arg2 are viewed readonly and %arg3 writeonly by the subspans below.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
// f16 constant 0.125, passed as the scalar fourth input of the attention op.
// NOTE(review): presumably the 1/sqrt(64) attention scale — confirm against the model source.
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
// View each binding at offset 0 as a 20x4096x64xf16 dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Load the full extent (offsets 0, sizes [20, 4096, 64], unit strides) of each readonly tensor.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
// Empty tensor supplied as the outs operand of the attention op.
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// Attention over %4, %5, %6 and the scalar %cst using indexing maps #map..#map4; the region yields its f32 block argument unchanged.
// NOTE(review): operand order is presumably query, key, value, scale — confirm against the op definition.
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
// Store the full attention result into the writeonly binding.
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Public entry point @attention (marked iree.abi.stub): takes three !hal.buffer_view inputs and returns one !hal.buffer_view.
// Per the iree.abi.declaration string, all three inputs and the output are tensor<20x4096x64xf16>.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// Resource size used throughout: 10485760 = 20 * 4096 * 64 f16 elements at 2 bytes each.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
// Expected dimensions, element type and encoding checked by the buffer-view asserts below.
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// For each argument: assert shape [20, 4096, 64], f16 element type and dense_row_major encoding,
// then import it as an external stream resource on @__device_0.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Dispatch the @attention_dispatch_0 export with the range 0 to 10485760 (length 10485760) of each imported resource;
// the result is a new external resource of the same size.
%3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0[%c0 to %c10485760 for %c10485760], %1[%c0 to %c10485760 for %c10485760], %2[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
// Export the dispatch result as a tensor<20x4096x64xf16> buffer view and return it.
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyAsyncAccessRangesPass (iree-stream-verify-async-access-ranges) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Executable @attention_dispatch_0: one public export plus the builtin.module that holds the function of the same name.
stream.executable private @attention_dispatch_0 { | |
// Export region: takes no workgroups() operands and returns the (x, y, z) counts produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body over four bindings: %arg0..%arg2 are viewed readonly and %arg3 writeonly by the subspans below.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
// f16 constant 0.125, passed as the scalar fourth input of the attention op.
// NOTE(review): presumably the 1/sqrt(64) attention scale — confirm against the model source.
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
// View each binding at offset 0 as a 20x4096x64xf16 dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Load the full extent (offsets 0, sizes [20, 4096, 64], unit strides) of each readonly tensor.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
// Empty tensor supplied as the outs operand of the attention op.
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// Attention over %4, %5, %6 and the scalar %cst using indexing maps #map..#map4; the region yields its f32 block argument unchanged.
// NOTE(review): operand order is presumably query, key, value, scale — confirm against the op definition.
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
// Store the full attention result into the writeonly binding.
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Public entry point @attention (marked iree.abi.stub): takes three !hal.buffer_view inputs and returns one !hal.buffer_view.
// Per the iree.abi.declaration string, all three inputs and the output are tensor<20x4096x64xf16>.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// Resource size used throughout: 10485760 = 20 * 4096 * 64 f16 elements at 2 bytes each.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
// Expected dimensions, element type and encoding checked by the buffer-view asserts below.
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// For each argument: assert shape [20, 4096, 64], f16 element type and dense_row_major encoding,
// then import it as an external stream resource on @__device_0.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Dispatch the @attention_dispatch_0 export with the range 0 to 10485760 (length 10485760) of each imported resource;
// the result is a new external resource of the same size.
%3 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%0[%c0 to %c10485760 for %c10485760], %1[%c0 to %c10485760 for %c10485760], %2[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
// Export the dispatch result as a tensor<20x4096x64xf16> buffer view and return it.
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ScheduleExecutionPass (iree-stream-schedule-execution) //----- // | |
// Public entry point @attention (marked iree.abi.stub): takes three !hal.buffer_view inputs and returns one !hal.buffer_view.
// Per the iree.abi.declaration string, all three inputs and the output are tensor<20x4096x64xf16>.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// Resource size used throughout: 10485760 = 20 * 4096 * 64 f16 elements at 2 bytes each.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
// Expected dimensions, element type and encoding checked by the buffer-view asserts below.
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// For each argument: assert shape [20, 4096, 64], f16 element type and dense_row_major encoding,
// then import it as an external stream resource on @__device_0.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Execution region on @__device_0: captures %0, %1, %2 as %arg3, %arg4, %arg5 and produces the result resource plus a !stream.timepoint.
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} { | |
// Dispatch the @attention_dispatch_0 export over the range 0 to 10485760 (length 10485760) of each captured resource and yield its result.
%5 = stream.async.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg3[%c0 to %c10485760 for %c10485760], %arg4[%c0 to %c10485760 for %c10485760], %arg5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
stream.yield %5 : !stream.resource<external>{%c10485760} | |
} => !stream.timepoint | |
// Await %result_timepoint to obtain %results, then export it as a tensor<20x4096x64xf16> buffer view and return it.
%3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c10485760} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After ScheduleConcurrencyPass (iree-stream-schedule-concurrency) //----- // | |
// Public entry point @attention (marked iree.abi.stub): takes three !hal.buffer_view inputs and returns one !hal.buffer_view.
// Per the iree.abi.declaration string, all three inputs and the output are tensor<20x4096x64xf16>.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// Resource size used throughout: 10485760 = 20 * 4096 * 64 f16 elements at 2 bytes each.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
// Expected dimensions, element type and encoding checked by the buffer-view asserts below.
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// For each argument: assert shape [20, 4096, 64], f16 element type and dense_row_major encoding,
// then import it as an external stream resource on @__device_0.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Execution region on @__device_0: captures %0, %1, %2 as %arg3, %arg4, %arg5 and produces the result resource plus a !stream.timepoint.
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} { | |
// Dispatch the @attention_dispatch_0 export over the range 0 to 10485760 (length 10485760) of each captured resource and yield its result.
%5 = stream.async.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg3[%c0 to %c10485760 for %c10485760], %arg4[%c0 to %c10485760 for %c10485760], %arg5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
stream.yield %5 : !stream.resource<external>{%c10485760} | |
} => !stream.timepoint | |
// Await %result_timepoint to obtain %results, then export it as a tensor<20x4096x64xf16> buffer view and return it.
%3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c10485760} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After PropagateTimepointsPass (iree-stream-propagate-timepoints) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Executable @attention_dispatch_0: one public export plus the builtin.module that holds the function of the same name.
stream.executable private @attention_dispatch_0 { | |
// Export region: takes no workgroups() operands and returns the (x, y, z) counts produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body over four bindings: %arg0..%arg2 are viewed readonly and %arg3 writeonly by the subspans below.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
// f16 constant 0.125, passed as the scalar fourth input of the attention op.
// NOTE(review): presumably the 1/sqrt(64) attention scale — confirm against the model source.
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
// View each binding at offset 0 as a 20x4096x64xf16 dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Load the full extent (offsets 0, sizes [20, 4096, 64], unit strides) of each readonly tensor.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
// Empty tensor supplied as the outs operand of the attention op.
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// Attention over %4, %5, %6 and the scalar %cst using indexing maps #map..#map4; the region yields its f32 block argument unchanged.
// NOTE(review): operand order is presumably query, key, value, scale — confirm against the op definition.
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
// Store the full attention result into the writeonly binding.
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Public entry point @attention (marked iree.abi.stub): takes three !hal.buffer_view inputs and returns one !hal.buffer_view.
// Per the iree.abi.declaration string, all three inputs and the output are tensor<20x4096x64xf16>.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// Resource size used throughout: 10485760 = 20 * 4096 * 64 f16 elements at 2 bytes each.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
// Expected dimensions, element type and encoding checked by the buffer-view asserts below.
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// For each argument: assert shape [20, 4096, 64], f16 element type and dense_row_major encoding,
// then import it as an external stream resource on @__device_0.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Three immediate timepoints joined with max; the joined timepoint %6 is what the execute region below awaits.
%3 = stream.timepoint.immediate => !stream.timepoint | |
%4 = stream.timepoint.immediate => !stream.timepoint | |
%5 = stream.timepoint.immediate => !stream.timepoint | |
%6 = stream.timepoint.join max(%3, %4, %5) => !stream.timepoint | |
// Execution region on @__device_0: awaits %6, captures %0, %1, %2 as %arg3, %arg4, %arg5 and produces the result resource plus a !stream.timepoint.
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%6) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} { | |
// Dispatch the @attention_dispatch_0 export over the range 0 to 10485760 (length 10485760) of each captured resource and yield its result.
%9 = stream.async.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg3[%c0 to %c10485760 for %c10485760], %arg4[%c0 to %c10485760 for %c10485760], %arg5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
stream.yield %9 : !stream.resource<external>{%c10485760} | |
} => !stream.timepoint | |
// Await %result_timepoint to obtain %results, then export it as a tensor<20x4096x64xf16> buffer view and return it.
%7 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c10485760} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After MaterializeBuiltinsPass (iree-stream-materialize-builtins) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
// Executable @attention_dispatch_0: one public export plus the builtin.module that holds the function of the same name.
stream.executable private @attention_dispatch_0 { | |
// Export region: takes no workgroups() operands and returns the (x, y, z) counts produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body over four bindings: %arg0..%arg2 are viewed readonly and %arg3 writeonly by the subspans below.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
// f16 constant 0.125, passed as the scalar fourth input of the attention op.
// NOTE(review): presumably the 1/sqrt(64) attention scale — confirm against the model source.
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
// View each binding at offset 0 as a 20x4096x64xf16 dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Load the full extent (offsets 0, sizes [20, 4096, 64], unit strides) of each readonly tensor.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
// Empty tensor supplied as the outs operand of the attention op.
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// Attention over %4, %5, %6 and the scalar %cst using indexing maps #map..#map4; the region yields its f32 block argument unchanged.
// NOTE(review): operand order is presumably query, key, value, scale — confirm against the op definition.
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
// Store the full attention result into the writeonly binding.
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Public entry point @attention (marked iree.abi.stub): takes three !hal.buffer_view inputs and returns one !hal.buffer_view.
// Per the iree.abi.declaration string, all three inputs and the output are tensor<20x4096x64xf16>.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// Resource size used throughout: 10485760 = 20 * 4096 * 64 f16 elements at 2 bytes each.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
// Expected dimensions, element type and encoding checked by the buffer-view asserts below.
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// For each argument: assert shape [20, 4096, 64], f16 element type and dense_row_major encoding,
// then import it as an external stream resource on @__device_0.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Three immediate timepoints joined with max; the joined timepoint %6 is what the execute region below awaits.
%3 = stream.timepoint.immediate => !stream.timepoint | |
%4 = stream.timepoint.immediate => !stream.timepoint | |
%5 = stream.timepoint.immediate => !stream.timepoint | |
%6 = stream.timepoint.join max(%3, %4, %5) => !stream.timepoint | |
// Execution region on @__device_0: awaits %6, captures %0, %1, %2 as %arg3, %arg4, %arg5 and produces the result resource plus a !stream.timepoint.
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%6) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} { | |
// Dispatch the @attention_dispatch_0 export over the range 0 to 10485760 (length 10485760) of each captured resource and yield its result.
%9 = stream.async.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg3[%c0 to %c10485760 for %c10485760], %arg4[%c0 to %c10485760 for %c10485760], %arg5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
stream.yield %9 : !stream.resource<external>{%c10485760} | |
} => !stream.timepoint | |
// Await %result_timepoint to obtain %results, then export it as a tensor<20x4096x64xf16> buffer view and return it.
%7 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c10485760} | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} { | |
%5 = stream.async.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg3[%c0 to %c10485760 for %c10485760], %arg4[%c0 to %c10485760 for %c10485760], %arg5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
stream.yield %5 : !stream.resource<external>{%c10485760} | |
} => !stream.timepoint | |
%3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c10485760} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} { | |
%5 = stream.async.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg3[%c0 to %c10485760 for %c10485760], %arg4[%c0 to %c10485760 for %c10485760], %arg5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
stream.yield %5 : !stream.resource<external>{%c10485760} | |
} => !stream.timepoint | |
%3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c10485760} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} { | |
%5 = stream.async.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg3[%c0 to %c10485760 for %c10485760], %arg4[%c0 to %c10485760 for %c10485760], %arg5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
stream.yield %5 : !stream.resource<external>{%c10485760} | |
} => !stream.timepoint | |
%3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c10485760} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} { | |
%5 = stream.async.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg3[%c0 to %c10485760 for %c10485760], %arg4[%c0 to %c10485760 for %c10485760], %arg5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
stream.yield %5 : !stream.resource<external>{%c10485760} | |
} => !stream.timepoint | |
%3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c10485760} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} { | |
%5 = stream.async.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg3[%c0 to %c10485760 for %c10485760], %arg4[%c0 to %c10485760 for %c10485760], %arg5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
stream.yield %5 : !stream.resource<external>{%c10485760} | |
} => !stream.timepoint | |
%3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c10485760} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} { | |
%5 = stream.async.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg3[%c0 to %c10485760 for %c10485760], %arg4[%c0 to %c10485760 for %c10485760], %arg5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
stream.yield %5 : !stream.resource<external>{%c10485760} | |
} => !stream.timepoint | |
%3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c10485760} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} { | |
%5 = stream.async.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg3[%c0 to %c10485760 for %c10485760], %arg4[%c0 to %c10485760 for %c10485760], %arg5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
stream.yield %5 : !stream.resource<external>{%c10485760} | |
} => !stream.timepoint | |
%3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c10485760} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} { | |
%5 = stream.async.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg3[%c0 to %c10485760 for %c10485760], %arg4[%c0 to %c10485760 for %c10485760], %arg5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
stream.yield %5 : !stream.resource<external>{%c10485760} | |
} => !stream.timepoint | |
%3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c10485760} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyLoweringToAsyncPass (iree-stream-verify-lowering-to-async) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} { | |
%5 = stream.async.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg3[%c0 to %c10485760 for %c10485760], %arg4[%c0 to %c10485760 for %c10485760], %arg5[%c0 to %c10485760 for %c10485760]) : (!stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}, !stream.resource<external>{%c10485760}) -> !stream.resource<external>{%c10485760} | |
stream.yield %5 : !stream.resource<external>{%c10485760} | |
} => !stream.timepoint | |
%3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c10485760} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ScheduleAllocationPass (iree-stream-schedule-allocation) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%c0_0 = arith.constant 0 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0_0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After PackConstantsPass (iree-stream-pack-constants) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%c0_0 = arith.constant 0 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0_0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After LayoutSlicesPass (iree-stream-layout-slices) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%c0_0 = arith.constant 0 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0_0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After PropagateSubrangesPass (iree-util-propagate-subranges) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%c0_0 = arith.constant 0 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0_0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyLoweringToCmdPass (iree-stream-verify-lowering-to-cmd) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Public entry point @attention: takes three !hal.buffer_view arguments and
// returns one !hal.buffer_view. The iree.abi.declaration string records the
// tensor-level signature (three tensor<20x4096x64xf16> inputs, one
// tensor<20x4096x64xf16> output).
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// 10485760 = 20 * 4096 * 64 * 2; used below as the size of every stream
// resource and as the length of every dispatch binding range.
// NOTE(review): presumably a byte count (f16 is 2 bytes wide) - confirm.
%c10485760 = arith.constant 10485760 : index | |
// Offset used for the start of each dispatch binding range.
%c0 = arith.constant 0 : index | |
// Dimension constants consumed by the shape([...]) clauses of the assertions.
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
// Element type / encoding values that every input buffer view is checked against.
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// For each argument: assert shape [20, 4096, 64], f16 element type and
// dense_row_major encoding, then import the buffer view as an external stream
// resource on device @__device_0.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Allocate the (uninitialized) external result resource; the op yields the
// resource together with a timepoint that the execute region awaits.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
// Execute region: awaits %result_timepoint and captures the three imported
// inputs plus the result resource as %arg3..%arg6.
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// Single dispatch of the attention executable export. Each binding covers the
// range [%c0 for %c10485760] of its resource.
// NOTE(review): `ro` / `wo` presumably mark read-only / write-only access - confirm.
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Await the execution timepoint %3 to obtain the result resource as %4.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
// Export the result resource as a !hal.buffer_view and return it.
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
// Public entry point @attention: takes three !hal.buffer_view arguments and
// returns one !hal.buffer_view. The iree.abi.declaration string records the
// tensor-level signature (three tensor<20x4096x64xf16> inputs, one
// tensor<20x4096x64xf16> output).
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// 10485760 = 20 * 4096 * 64 * 2; used below as the size of every stream
// resource and as the length of every dispatch binding range.
// NOTE(review): presumably a byte count (f16 is 2 bytes wide) - confirm.
%c10485760 = arith.constant 10485760 : index | |
// Offset used for the start of each dispatch binding range.
%c0 = arith.constant 0 : index | |
// Dimension constants consumed by the shape([...]) clauses of the assertions.
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
// Element type / encoding values that every input buffer view is checked against.
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// For each argument: assert shape [20, 4096, 64], f16 element type and
// dense_row_major encoding, then import the buffer view as an external stream
// resource on device @__device_0.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Allocate the (uninitialized) external result resource; the op yields the
// resource together with a timepoint that the execute region awaits.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
// Execute region: awaits %result_timepoint and captures the three imported
// inputs plus the result resource as %arg3..%arg6.
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// Single dispatch of the attention executable export. Each binding covers the
// range [%c0 for %c10485760] of its resource.
// NOTE(review): `ro` / `wo` presumably mark read-only / write-only access - confirm.
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Await the execution timepoint %3 to obtain the result resource as %4.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
// Export the result resource as a !hal.buffer_view and return it.
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
// Public entry point @attention: takes three !hal.buffer_view arguments and
// returns one !hal.buffer_view. The iree.abi.declaration string records the
// tensor-level signature (three tensor<20x4096x64xf16> inputs, one
// tensor<20x4096x64xf16> output).
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// 10485760 = 20 * 4096 * 64 * 2; used below as the size of every stream
// resource and as the length of every dispatch binding range.
// NOTE(review): presumably a byte count (f16 is 2 bytes wide) - confirm.
%c10485760 = arith.constant 10485760 : index | |
// Offset used for the start of each dispatch binding range.
%c0 = arith.constant 0 : index | |
// Dimension constants consumed by the shape([...]) clauses of the assertions.
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
// Element type / encoding values that every input buffer view is checked against.
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// For each argument: assert shape [20, 4096, 64], f16 element type and
// dense_row_major encoding, then import the buffer view as an external stream
// resource on device @__device_0.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Allocate the (uninitialized) external result resource; the op yields the
// resource together with a timepoint that the execute region awaits.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
// Execute region: awaits %result_timepoint and captures the three imported
// inputs plus the result resource as %arg3..%arg6.
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// Single dispatch of the attention executable export. Each binding covers the
// range [%c0 for %c10485760] of its resource.
// NOTE(review): `ro` / `wo` presumably mark read-only / write-only access - confirm.
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Await the execution timepoint %3 to obtain the result resource as %4.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
// Export the result resource as a !hal.buffer_view and return it.
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
// Public entry point @attention: takes three !hal.buffer_view arguments and
// returns one !hal.buffer_view. The iree.abi.declaration string records the
// tensor-level signature (three tensor<20x4096x64xf16> inputs, one
// tensor<20x4096x64xf16> output).
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// 10485760 = 20 * 4096 * 64 * 2; used below as the size of every stream
// resource and as the length of every dispatch binding range.
// NOTE(review): presumably a byte count (f16 is 2 bytes wide) - confirm.
%c10485760 = arith.constant 10485760 : index | |
// Offset used for the start of each dispatch binding range.
%c0 = arith.constant 0 : index | |
// Dimension constants consumed by the shape([...]) clauses of the assertions.
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
// Element type / encoding values that every input buffer view is checked against.
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// For each argument: assert shape [20, 4096, 64], f16 element type and
// dense_row_major encoding, then import the buffer view as an external stream
// resource on device @__device_0.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Allocate the (uninitialized) external result resource; the op yields the
// resource together with a timepoint that the execute region awaits.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
// Execute region: awaits %result_timepoint and captures the three imported
// inputs plus the result resource as %arg3..%arg6.
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// Single dispatch of the attention executable export. Each binding covers the
// range [%c0 for %c10485760] of its resource.
// NOTE(review): `ro` / `wo` presumably mark read-only / write-only access - confirm.
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Await the execution timepoint %3 to obtain the result resource as %4.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
// Export the result resource as a !hal.buffer_view and return it.
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
// Executable target alias: "llvm-cpu" backend, "embedded-elf-x86_64" format,
// generic CPU with no extra features, native_vector_size = 16.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Indexing maps over a 5-d iteration space (d0..d4), listed in the
// indexing_maps attribute of the iree_linalg_ext.attention op below.
// NOTE(review): presumably one map per operand in order (three tensor inputs,
// the scalar %cst with the empty map, then the output) - confirm.
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
// "local" device target that wraps the x86_64 executable target above.
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
// Top-level module; its default stream affinity is device @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the local device target alias.
util.global private @__device_0 = #device_target_local | |
// Executable containing the single attention dispatch.
stream.executable private @attention_dispatch_0 { | |
// Export entry: the three workgroup counts are produced by
// flow.dispatch.workgroup_count_from_slice and returned unchanged.
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: four bindings, the first three mapped to readonly dispatch
// tensors and the fourth to a writeonly dispatch tensor.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
// f16 scalar 0.125, passed as the fourth `ins` operand of the attention op.
// NOTE(review): 0.125 == 1/sqrt(64); presumably the attention scale - confirm.
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
// View each binding, at offset %c0, as a 20x4096x64xf16 dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Load the full extent (offsets 0, sizes [20, 4096, 64], unit strides) of each input.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
// Empty tensor used as the `outs` operand of the attention op.
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// Attention over the three loaded tensors and %cst; the region receives one
// f32 block argument and yields it unchanged.
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
// Store the full attention result into the writeonly output binding.
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Public entry point @attention: takes three !hal.buffer_view arguments and
// returns one !hal.buffer_view. The iree.abi.declaration string records the
// tensor-level signature (three tensor<20x4096x64xf16> inputs, one output).
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// 10485760 = 20 * 4096 * 64 * 2; used as the size of every stream resource.
// NOTE(review): presumably a byte count (f16 is 2 bytes wide) - confirm.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
// Dimension constants consumed by the shape([...]) clauses of the assertions.
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
// Element type / encoding values that every input buffer view is checked against.
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// For each argument: assert shape, element type and encoding, then import the
// buffer view as an external stream resource on device @__device_0.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Allocate the (uninitialized) external result resource plus its timepoint.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
// Execute region: awaits %result_timepoint and captures the three inputs and
// the result resource as %arg3..%arg6.
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// Single dispatch of the export defined in @attention_dispatch_0 above; the
// ro/wo bindings line up with its readonly/writeonly dispatch tensors.
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Await the execution timepoint, then export the result as a !hal.buffer_view.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
// Executable target alias: "llvm-cpu" backend, "embedded-elf-x86_64" format,
// generic CPU with no extra features, native_vector_size = 16.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Indexing maps over a 5-d iteration space (d0..d4), listed in the
// indexing_maps attribute of the iree_linalg_ext.attention op below.
// NOTE(review): presumably one map per operand in order (three tensor inputs,
// the scalar %cst with the empty map, then the output) - confirm.
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
// "local" device target that wraps the x86_64 executable target above.
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
// Top-level module; its default stream affinity is device @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the local device target alias.
util.global private @__device_0 = #device_target_local | |
// Executable containing the single attention dispatch.
stream.executable private @attention_dispatch_0 { | |
// Export entry: the three workgroup counts are produced by
// flow.dispatch.workgroup_count_from_slice and returned unchanged.
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: four bindings, the first three mapped to readonly dispatch
// tensors and the fourth to a writeonly dispatch tensor.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
// f16 scalar 0.125, passed as the fourth `ins` operand of the attention op.
// NOTE(review): 0.125 == 1/sqrt(64); presumably the attention scale - confirm.
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
// View each binding, at offset %c0, as a 20x4096x64xf16 dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Load the full extent (offsets 0, sizes [20, 4096, 64], unit strides) of each input.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
// Empty tensor used as the `outs` operand of the attention op.
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// Attention over the three loaded tensors and %cst; the region receives one
// f32 block argument and yields it unchanged.
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
// Store the full attention result into the writeonly output binding.
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Public entry point @attention: takes three !hal.buffer_view arguments and
// returns one !hal.buffer_view. The iree.abi.declaration string records the
// tensor-level signature (three tensor<20x4096x64xf16> inputs, one output).
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// 10485760 = 20 * 4096 * 64 * 2; used as the size of every stream resource.
// NOTE(review): presumably a byte count (f16 is 2 bytes wide) - confirm.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
// Dimension constants consumed by the shape([...]) clauses of the assertions.
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
// Element type / encoding values that every input buffer view is checked against.
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// For each argument: assert shape, element type and encoding, then import the
// buffer view as an external stream resource on device @__device_0.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Allocate the (uninitialized) external result resource plus its timepoint.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
// Execute region: awaits %result_timepoint and captures the three inputs and
// the result resource as %arg3..%arg6.
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// Single dispatch of the export defined in @attention_dispatch_0 above; the
// ro/wo bindings line up with its readonly/writeonly dispatch tensors.
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Await the execution timepoint, then export the result as a !hal.buffer_view.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
// Executable target alias: "llvm-cpu" backend, "embedded-elf-x86_64" format,
// generic CPU with no extra features, native_vector_size = 16.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Indexing maps over a 5-d iteration space (d0..d4), listed in the
// indexing_maps attribute of the iree_linalg_ext.attention op below.
// NOTE(review): presumably one map per operand in order (three tensor inputs,
// the scalar %cst with the empty map, then the output) - confirm.
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
// "local" device target that wraps the x86_64 executable target above.
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
// Top-level module; its default stream affinity is device @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the local device target alias.
util.global private @__device_0 = #device_target_local | |
// Executable containing the single attention dispatch.
stream.executable private @attention_dispatch_0 { | |
// Export entry: the three workgroup counts are produced by
// flow.dispatch.workgroup_count_from_slice and returned unchanged.
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: four bindings, the first three mapped to readonly dispatch
// tensors and the fourth to a writeonly dispatch tensor.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
// f16 scalar 0.125, passed as the fourth `ins` operand of the attention op.
// NOTE(review): 0.125 == 1/sqrt(64); presumably the attention scale - confirm.
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
// View each binding, at offset %c0, as a 20x4096x64xf16 dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Load the full extent (offsets 0, sizes [20, 4096, 64], unit strides) of each input.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
// Empty tensor used as the `outs` operand of the attention op.
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// Attention over the three loaded tensors and %cst; the region receives one
// f32 block argument and yields it unchanged.
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
// Store the full attention result into the writeonly output binding.
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Public entry point @attention: takes three !hal.buffer_view arguments and
// returns one !hal.buffer_view. The iree.abi.declaration string records the
// tensor-level signature (three tensor<20x4096x64xf16> inputs, one output).
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// 10485760 = 20 * 4096 * 64 * 2; used as the size of every stream resource.
// NOTE(review): presumably a byte count (f16 is 2 bytes wide) - confirm.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
// Dimension constants consumed by the shape([...]) clauses of the assertions.
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
// Element type / encoding values that every input buffer view is checked against.
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// For each argument: assert shape, element type and encoding, then import the
// buffer view as an external stream resource on device @__device_0.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Allocate the (uninitialized) external result resource plus its timepoint.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
// Execute region: awaits %result_timepoint and captures the three inputs and
// the result resource as %arg3..%arg6.
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// Single dispatch of the export defined in @attention_dispatch_0 above; the
// ro/wo bindings line up with its readonly/writeonly dispatch tensors.
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Await the execution timepoint, then export the result as a !hal.buffer_view.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- // | |
// Public entry point @attention: takes three !hal.buffer_view arguments and
// returns one !hal.buffer_view. The iree.abi.declaration string records the
// tensor-level signature (three tensor<20x4096x64xf16> inputs, one
// tensor<20x4096x64xf16> output).
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// 10485760 = 20 * 4096 * 64 * 2; used below as the size of every stream
// resource and as the length of every dispatch binding range.
// NOTE(review): presumably a byte count (f16 is 2 bytes wide) - confirm.
%c10485760 = arith.constant 10485760 : index | |
// Offset used for the start of each dispatch binding range.
%c0 = arith.constant 0 : index | |
// Dimension constants consumed by the shape([...]) clauses of the assertions.
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
// Element type / encoding values that every input buffer view is checked against.
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// For each argument: assert shape [20, 4096, 64], f16 element type and
// dense_row_major encoding, then import the buffer view as an external stream
// resource on device @__device_0.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Allocate the (uninitialized) external result resource; the op yields the
// resource together with a timepoint that the execute region awaits.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
// Execute region: awaits %result_timepoint and captures the three imported
// inputs plus the result resource as %arg3..%arg6.
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// Single dispatch of the attention executable export. Each binding covers the
// range [%c0 for %c10485760] of its resource.
// NOTE(review): `ro` / `wo` presumably mark read-only / write-only access - confirm.
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Await the execution timepoint %3 to obtain the result resource as %4.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
// Export the result resource as a !hal.buffer_view and return it.
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ElideTimepointsPass (iree-stream-elide-timepoints) //----- // | |
// Snapshot contents: one device global (@__device_0), one stream executable
// (@attention_dispatch_0) holding a single attention dispatch, and the public
// host entry point @attention. The module attribute dictionary in this snapshot
// still carries `iree.fixedpoint.iteration = 0 : index`.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Indexing maps over a 5-d iteration space (d0..d4) used by the attention op
// below. #map3 produces no result dimensions, i.e. it indexes a scalar operand.
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
// Export: the three workgroup counts are produced by
// flow.dispatch.workgroup_count_from_slice and returned as-is.
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. Takes four bindings: %arg0..%arg2 are viewed as read-only
// 20x4096x64xf16 tensors, %arg3 as the write-only 20x4096x64xf16 result.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
// 0.125 is numerically 1/sqrt(64); 64 is the innermost tensor dimension.
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
// Every binding is taken at the constant offset %c0.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Whole-tensor loads (zero offsets, full sizes, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// Attention over the three loaded tensors plus the scalar %cst, written into
// the empty tensor %7. The region yields its f32 block argument unchanged.
// NOTE(review): operands are presumably query, key, value, scale in that
// order -- confirm against the iree_linalg_ext.attention op definition.
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
// Whole-tensor store of the attention result into the write-only binding.
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Host entry point: three buffer views in, one buffer view out.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// 10485760 = 20 * 4096 * 64 elements * 2 bytes per f16; used as the size of
// every stream resource below.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// Each input is asserted to be shape [20, 4096, 64], element type f16,
// dense row-major encoding, and then imported as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Uninitialized external allocation for the result; the execute region below
// awaits the timepoint this allocation returns.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// Single dispatch with three read-only ranges and one write-only range, each
// spanning offset %c0 for %c10485760.
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Wait on the execute timepoint, then export the result resource as a buffer
// view and return it.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- // | |
// Snapshot contents: device global @__device_0, stream executable
// @attention_dispatch_0, and the public host function @attention.
// Relative to the dump that precedes this one in the file, the module attribute
// dictionary no longer contains `iree.fixedpoint.iteration`; only
// `stream.affinity.default` remains.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Five-dimensional indexing maps consumed by the attention op; #map3 has an
// empty result list (scalar operand).
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
// Workgroup counts are taken from flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch function: three read-only input bindings and one write-only output
// binding, all typed as 20x4096x64xf16 tensors.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
// All four subspans use the constant offset %c0.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// Attention op: three f16 tensors plus the f16 scalar %cst (0.125) in, one
// f16 tensor out; the region forwards its f32 argument untouched.
// NOTE(review): input order is presumably query, key, value, scale -- verify
// against the iree_linalg_ext.attention documentation.
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Host function: validates and imports the three inputs, allocates the result,
// runs the dispatch, and exports the result as a buffer view.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// Resource size constant: 20 * 4096 * 64 * 2 = 10485760.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// Assert shape/type/encoding of each buffer view, then import it.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Result storage is allocated uninitialized; its timepoint is awaited by the
// execute op that follows.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// The dispatch takes no extra operands here, only the four resource ranges.
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Await completion, export, return.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseDispatchBindingsPass (iree-stream-fuse-dispatch-bindings) //----- // | |
// Snapshot contents: device global, stream executable, and host function
// @attention. In this snapshot the dispatch function takes four additional
// `index` arguments (%arg4..%arg7) that are used as the binding subspan
// offsets, and the host-side dispatch passes %c0 for each of them.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Indexing maps for the attention op (5-d iteration space; #map3 is scalar).
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
// Workgroup counts come from flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch function: bindings %arg0..%arg3 followed by one index offset per
// binding (%arg4 -> %arg0, %arg5 -> %arg1, %arg6 -> %arg2, %arg7 -> %arg3).
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding, %arg4: index, %arg5: index, %arg6: index, %arg7: index) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
// %c0 is still defined here but no op in this function references it; the
// subspans below take their offsets from the index arguments instead.
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%arg4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%arg5] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%arg6] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%arg7] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Whole-tensor loads of the three inputs.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// Attention over the loaded tensors and the scalar %cst (0.125); the region's
// block argument is %arg8 here because %arg4..%arg7 are now function
// arguments. The region yields that argument unchanged.
// NOTE(review): inputs presumably are query, key, value, scale -- confirm
// against the iree_linalg_ext.attention op definition.
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg8: f32): | |
iree_linalg_ext.yield %arg8 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Host function: asserts and imports the inputs, allocates the result, issues
// the dispatch, then exports the result.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// 10485760 = 20 * 4096 * 64 * 2 (f16 is two bytes); size of each resource.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
// Second zero constant: %c0_0 is used for the resource range offsets, while
// the original %c0 is what gets passed as the four dispatch operands.
%c0_0 = arith.constant 0 : index | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// The four %c0 operands line up with the dispatch function's %arg4..%arg7.
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%c0, %c0, %c0, %c0 : index, index, index, index) { | |
ro %arg3[%c0_0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0_0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0_0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0_0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Await the execute timepoint, export the result, return it.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After AnnotateDispatchArgumentsPass (iree-stream-annotate-dispatch-arguments) //----- // | |
// Snapshot contents: device global, stream executable, and host function
// @attention. In this snapshot the dispatch function's arguments carry
// attributes: `stream.alignment = 64 : index` on each binding and
// `stream.values = [0 : index]` on each index offset argument.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Indexing maps for the attention op (5-d iteration space; #map3 is scalar).
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
// Workgroup counts come from flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch function. The `stream.values = [0 : index]` annotations agree with
// the only call site in this module, which passes %c0 for all four offsets.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}, %arg4: index {stream.values = [0 : index]}, %arg5: index {stream.values = [0 : index]}, %arg6: index {stream.values = [0 : index]}, %arg7: index {stream.values = [0 : index]}) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
// %c0 is defined but not referenced by any op in this function.
%c0 = arith.constant 0 : index | |
// Each binding is offset by its matching index argument.
%0 = stream.binding.subspan %arg0[%arg4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%arg5] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%arg6] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%arg7] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// Attention over the loaded tensors and the scalar %cst (0.125); the region
// yields its f32 block argument unchanged.
// NOTE(review): inputs presumably are query, key, value, scale -- confirm
// against the iree_linalg_ext.attention op definition.
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg8: f32): | |
iree_linalg_ext.yield %arg8 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Host function: asserts and imports the inputs, allocates the result, issues
// the dispatch, then exports the result.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// 10485760 = 20 * 4096 * 64 * 2 (f16 is two bytes); size of each resource.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
// %c0_0 supplies the resource range offsets; %c0 supplies the dispatch operands.
%c0_0 = arith.constant 0 : index | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%c0, %c0, %c0, %c0 : index, index, index, index) { | |
ro %arg3[%c0_0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0_0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0_0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0_0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Await the execute timepoint, export the result, return it.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After AnnotateDispatchAssumptionsPass (iree-stream-annotate-dispatch-assumptions) //----- // | |
// Snapshot contents: device global, stream executable, and host function
// @attention. In this snapshot the dispatch function begins with a
// `util.assume.int` op that wraps the four index offset arguments with
// `umin = 0, umax = 0`; the subspans consume its results (%0#0..%0#3) and every
// later SSA value in the function is numbered one higher than in the preceding
// dump.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Indexing maps for the attention op (5-d iteration space; #map3 is scalar).
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
// Workgroup counts come from flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch function: four 64-aligned bindings followed by four index offsets
// whose only annotated value is 0.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}, %arg4: index {stream.values = [0 : index]}, %arg5: index {stream.values = [0 : index]}, %arg6: index {stream.values = [0 : index]}, %arg7: index {stream.values = [0 : index]}) { | |
// Range assumptions: each offset argument is bounded to [0, 0]. The four
// results replace direct uses of %arg4..%arg7 below.
%0:4 = util.assume.int | |
%arg4<umin = 0, umax = 0>, | |
%arg5<umin = 0, umax = 0>, | |
%arg6<umin = 0, umax = 0>, | |
%arg7<umin = 0, umax = 0> | |
: index, index, index, index | |
%cst = arith.constant 1.250000e-01 : f16 | |
// %c0 is defined but not referenced by any op in this function.
%c0 = arith.constant 0 : index | |
// Binding N is offset by assumption result %0#N.
%1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg2[%0#2] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%4 = stream.binding.subspan %arg3[%0#3] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Whole-tensor loads of the three inputs.
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%8 = tensor.empty() : tensor<20x4096x64xf16> | |
// Attention over the loaded tensors and the scalar %cst (0.125); the region
// yields its f32 block argument unchanged.
// NOTE(review): inputs presumably are query, key, value, scale -- confirm
// against the iree_linalg_ext.attention op definition.
%9 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%5, %6, %7, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%8 : tensor<20x4096x64xf16>) { | |
^bb0(%arg8: f32): | |
iree_linalg_ext.yield %arg8 : f32 | |
} -> tensor<20x4096x64xf16> | |
// Whole-tensor store of the result into the write-only binding view %4.
flow.dispatch.tensor.store %9, %4, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Host function: asserts and imports the inputs, allocates the result, issues
// the dispatch, then exports the result.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// 10485760 = 20 * 4096 * 64 * 2 (f16 is two bytes); size of each resource.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
// %c0_0 supplies the resource range offsets; %c0 supplies the dispatch operands.
%c0_0 = arith.constant 0 : index | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%c0, %c0, %c0, %c0 : index, index, index, index) { | |
ro %arg3[%c0_0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0_0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0_0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0_0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Await the execute timepoint, export the result, return it.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After PackDispatchOperandsPass (iree-stream-pack-dispatch-operands) //----- // | |
// Attribute aliases used by the module that follows.
// Executable target: "llvm-cpu" backend, "embedded-elf-x86_64" format, generic CPU with no extra CPU features and native_vector_size = 16.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Affine maps over a five-dimensional space (d0..d4); the attention op below lists them, in this order, as its indexing_maps.
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
// #map3 has an empty result list; it is the map paired with the scalar f16 operand of the attention op.
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
// "local" device target whose only executable target is the alias defined at the top of this group.
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
// Public export of the dispatch entry point. Its workgroups region takes no operands and returns three index values (x, y, z).
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
// The three counts are produced by flow.dispatch.workgroup_count_from_slice.
// NOTE(review): presumably a placeholder that a later lowering pass replaces with concrete counts; confirm.
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body as printed after PackDispatchOperandsPass.
// Arguments: four !stream.binding values (each annotated stream.alignment = 64) followed by eight i32 operands.
// The eight i32 operands are consumed pairwise below to rebuild four index values.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32) { | |
// Pair 1 (%arg4, %arg5): zero-extend both to i64, shift the second left by 32, OR them together, cast to index.
%0 = arith.extui %arg4 : i32 to i64 | |
%1 = arith.extui %arg5 : i32 to i64 | |
%c32_i64 = arith.constant 32 : i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index | |
// Pair 2 (%arg6, %arg7): same low | (high << 32) reconstruction.
%5 = arith.extui %arg6 : i32 to i64 | |
%6 = arith.extui %arg7 : i32 to i64 | |
%c32_i64_0 = arith.constant 32 : i64 | |
%7 = arith.shli %6, %c32_i64_0 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index | |
// Pair 3 (%arg8, %arg9).
%10 = arith.extui %arg8 : i32 to i64 | |
%11 = arith.extui %arg9 : i32 to i64 | |
%c32_i64_1 = arith.constant 32 : i64 | |
%12 = arith.shli %11, %c32_i64_1 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index | |
// Pair 4 (%arg10, %arg11).
%15 = arith.extui %arg10 : i32 to i64 | |
%16 = arith.extui %arg11 : i32 to i64 | |
%c32_i64_2 = arith.constant 32 : i64 | |
%17 = arith.shli %16, %c32_i64_2 : i64 | |
%18 = arith.ori %15, %17 : i64 | |
%19 = arith.index_castui %18 {stream.values = [0 : index]} : i64 to index | |
// Range assumption on the four rebuilt values: each has umin = umax = 0, i.e. each is asserted to be zero.
%20:4 = util.assume.int | |
%4<umin = 0, umax = 0>, | |
%9<umin = 0, umax = 0>, | |
%14<umin = 0, umax = 0>, | |
%19<umin = 0, umax = 0> | |
: index, index, index, index | |
%cst = arith.constant 1.250000e-01 : f16 | |
// %c0 has no visible use in this function.
%c0 = arith.constant 0 : index | |
// The assumed values are the offsets at which each binding is viewed: %arg0..%arg2 as read-only tensors, %arg3 as a write-only tensor.
%21 = stream.binding.subspan %arg0[%20#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%22 = stream.binding.subspan %arg1[%20#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%23 = stream.binding.subspan %arg2[%20#2] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%24 = stream.binding.subspan %arg3[%20#3] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Load each input in full: zero offsets, sizes [20, 4096, 64], unit strides.
%25 = flow.dispatch.tensor.load %21, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%26 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%27 = flow.dispatch.tensor.load %23, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%28 = tensor.empty() : tensor<20x4096x64xf16> | |
// Attention over the three loaded tensors plus the scalar %cst (0.125 as f16), with %28 as the output operand.
// NOTE(review): the operands are presumably query, key, value and scale in that order; confirm against the op definition.
// The region takes a single f32 block argument and yields it unchanged.
%29 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%25, %26, %27, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%28 : tensor<20x4096x64xf16>) { | |
^bb0(%arg12: f32): | |
iree_linalg_ext.yield %arg12 : f32 | |
} -> tensor<20x4096x64xf16> | |
// Write the complete result tensor into the write-only binding.
flow.dispatch.tensor.store %29, %24, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Host-side entry point @attention as printed after PackDispatchOperandsPass.
// Takes three !hal.buffer_view inputs and returns one !hal.buffer_view; the reflection attribute declares it as a
// sync function over tensor<20x4096x64xf16> inputs and output.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// Size operand for every stream resource below; 10485760 equals 20 * 4096 * 64 f16 elements at 2 bytes each.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// Each input buffer view is checked for shape [20, 4096, 64], f16 element type and dense row-major encoding,
// then imported as an external stream resource on device @__device_0.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Output storage: an uninitialized external resource, returned together with a timepoint for the allocation.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
// %c0_0 is the offset used for all four bindings of the dispatch below.
%c0_0 = arith.constant 0 : index | |
// Constants emitted for the packed dispatch operands. Only the eight %c0_i32* values are consumed (by the dispatch below);
// the i64 zeros and the 32 : i64 constants have no uses in this function.
%c0_i64 = arith.constant 0 : i64 | |
%c0_i32 = arith.constant 0 : i32 | |
%c32_i64 = arith.constant 32 : i64 | |
%c0_i64_1 = arith.constant 0 : i64 | |
%c0_i32_2 = arith.constant 0 : i32 | |
%c0_i64_3 = arith.constant 0 : i64 | |
%c0_i32_4 = arith.constant 0 : i32 | |
%c32_i64_5 = arith.constant 32 : i64 | |
%c0_i64_6 = arith.constant 0 : i64 | |
%c0_i32_7 = arith.constant 0 : i32 | |
%c0_i64_8 = arith.constant 0 : i64 | |
%c0_i32_9 = arith.constant 0 : i32 | |
%c32_i64_10 = arith.constant 32 : i64 | |
%c0_i64_11 = arith.constant 0 : i64 | |
%c0_i32_12 = arith.constant 0 : i32 | |
%c0_i64_13 = arith.constant 0 : i64 | |
%c0_i32_14 = arith.constant 0 : i32 | |
%c32_i64_15 = arith.constant 32 : i64 | |
%c0_i64_16 = arith.constant 0 : i64 | |
%c0_i32_17 = arith.constant 0 : i32 | |
// The command region waits on the allocation timepoint and captures the three inputs plus the result resource.
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// One dispatch of the attention executable with eight i32 zero operands; the dispatch function in this module
// recombines them pairwise into four 64-bit values that it uses as binding offsets.
// Bindings: three read-only inputs and one write-only output, each covering the whole resource from offset %c0_0.
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%c0_i32, %c0_i32_2, %c0_i32_4, %c0_i32_7, %c0_i32_9, %c0_i32_12, %c0_i32_14, %c0_i32_17 : i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg3[%c0_0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0_0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0_0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0_0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Wait for the command region to finish, then export the result resource as the returned buffer view.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Host-side entry point @attention as printed after the Canonicalizer pass.
// Takes three !hal.buffer_view inputs and returns one !hal.buffer_view; the reflection attribute declares it as a
// sync function over tensor<20x4096x64xf16> inputs and output.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// Single i32 zero reused for all eight dispatch operands below.
%c0_i32 = arith.constant 0 : i32 | |
// Size operand for every stream resource below; 10485760 equals 20 * 4096 * 64 f16 elements at 2 bytes each.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// Each input buffer view is checked for shape [20, 4096, 64], f16 element type and dense row-major encoding,
// then imported as an external stream resource on device @__device_0.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Output storage: an uninitialized external resource, returned together with a timepoint for the allocation.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
// The command region waits on the allocation timepoint and captures the three inputs plus the result resource.
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// One dispatch of the attention executable with eight i32 zero operands.
// NOTE(review): presumably the low/high halves of four 64-bit binding offsets, matching the pairwise unpacking in the dispatch function printed elsewhere in this file; confirm.
// Bindings: three read-only inputs and one write-only output, each covering the whole resource from offset %c0.
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Wait for the command region to finish, then export the result resource as the returned buffer view.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Host-side entry point @attention as printed after the CSE pass.
// Takes three !hal.buffer_view inputs and returns one !hal.buffer_view; the reflection attribute declares it as a
// sync function over tensor<20x4096x64xf16> inputs and output.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// Single i32 zero reused for all eight dispatch operands below.
%c0_i32 = arith.constant 0 : i32 | |
// Size operand for every stream resource below; 10485760 equals 20 * 4096 * 64 f16 elements at 2 bytes each.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// Validate each input (shape [20, 4096, 64], f16, dense row-major) and import it as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Allocate the uninitialized output resource; %result_timepoint represents the allocation.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
// Command region: awaits the allocation timepoint and captures the three inputs and the result resource.
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// One dispatch of the attention executable with eight i32 zero operands.
// NOTE(review): presumably the low/high halves of four 64-bit binding offsets, matching the pairwise unpacking in the dispatch function printed elsewhere in this file; confirm.
// Bindings: three read-only inputs and one write-only output, each covering the whole resource from offset %c0.
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Await completion of the command region, then export the result resource as the returned buffer view.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
// Host-side entry point @attention as printed after OptimizeIntArithmeticPass.
// Takes three !hal.buffer_view inputs and returns one !hal.buffer_view; the reflection attribute declares it as a
// sync function over tensor<20x4096x64xf16> inputs and output.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// Single i32 zero reused for all eight dispatch operands below.
%c0_i32 = arith.constant 0 : i32 | |
// Size operand for every stream resource below; 10485760 equals 20 * 4096 * 64 f16 elements at 2 bytes each.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// Validate each input (shape [20, 4096, 64], f16, dense row-major) and import it as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Allocate the uninitialized output resource; %result_timepoint represents the allocation.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
// Command region: awaits the allocation timepoint and captures the three inputs and the result resource.
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// One dispatch of the attention executable with eight i32 zero operands.
// NOTE(review): presumably the low/high halves of four 64-bit binding offsets, matching the pairwise unpacking in the dispatch function printed elsewhere in this file; confirm.
// Bindings: three read-only inputs and one write-only output, each covering the whole resource from offset %c0.
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Await completion of the command region, then export the result resource as the returned buffer view.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
// Host-side entry point @attention as printed after SimplifyGlobalAccessesPass.
// Takes three !hal.buffer_view inputs and returns one !hal.buffer_view; the reflection attribute declares it as a
// sync function over tensor<20x4096x64xf16> inputs and output.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// Single i32 zero reused for all eight dispatch operands below.
%c0_i32 = arith.constant 0 : i32 | |
// Size operand for every stream resource below; 10485760 equals 20 * 4096 * 64 f16 elements at 2 bytes each.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// Validate each input (shape [20, 4096, 64], f16, dense row-major) and import it as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Allocate the uninitialized output resource; %result_timepoint represents the allocation.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
// Command region: awaits the allocation timepoint and captures the three inputs and the result resource.
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// One dispatch of the attention executable with eight i32 zero operands.
// NOTE(review): presumably the low/high halves of four 64-bit binding offsets, matching the pairwise unpacking in the dispatch function printed elsewhere in this file; confirm.
// Bindings: three read-only inputs and one write-only output, each covering the whole resource from offset %c0.
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Await completion of the command region, then export the result resource as the returned buffer view.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
// Host-side entry point @attention as printed after ApplyPatternsPass.
// Takes three !hal.buffer_view inputs and returns one !hal.buffer_view; the reflection attribute declares it as a
// sync function over tensor<20x4096x64xf16> inputs and output.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// Single i32 zero reused for all eight dispatch operands below.
%c0_i32 = arith.constant 0 : i32 | |
// Size operand for every stream resource below; 10485760 equals 20 * 4096 * 64 f16 elements at 2 bytes each.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// Validate each input (shape [20, 4096, 64], f16, dense row-major) and import it as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Allocate the uninitialized output resource; %result_timepoint represents the allocation.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
// Command region: awaits the allocation timepoint and captures the three inputs and the result resource.
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// One dispatch of the attention executable with eight i32 zero operands.
// NOTE(review): presumably the low/high halves of four 64-bit binding offsets, matching the pairwise unpacking in the dispatch function printed elsewhere in this file; confirm.
// Bindings: three read-only inputs and one write-only output, each covering the whole resource from offset %c0.
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Await completion of the command region, then export the result resource as the returned buffer view.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
// Attribute aliases used by the module that follows.
// Executable target: "llvm-cpu" backend, "embedded-elf-x86_64" format, generic CPU with no extra CPU features and native_vector_size = 16.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Affine maps over a five-dimensional space (d0..d4); the attention op below lists them, in this order, as its indexing_maps.
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
// #map3 has an empty result list; it is the map paired with the scalar f16 operand of the attention op.
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
// "local" device target whose only executable target is the alias defined at the top of this group.
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
// Public export of the dispatch entry point. Its workgroups region takes no operands and returns three index values (x, y, z).
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
// The three counts are produced by flow.dispatch.workgroup_count_from_slice.
// NOTE(review): presumably a placeholder that a later lowering pass replaces with concrete counts; confirm.
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body as printed in the dump after FoldGlobalsPass.
// Arguments: four !stream.binding values (each annotated stream.alignment = 64) followed by eight i32 operands.
// The eight i32 operands are consumed pairwise below to rebuild four index values.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32) { | |
// Scalar operand passed to the attention op below (0.125 as f16).
%cst = arith.constant 1.250000e-01 : f16 | |
// Shift amount shared by all four pair reconstructions.
%c32_i64 = arith.constant 32 : i64 | |
// Pair 1 (%arg4, %arg5): zero-extend both to i64, shift the second left by 32, OR them together, cast to index.
%0 = arith.extui %arg4 : i32 to i64 | |
%1 = arith.extui %arg5 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index | |
// Pair 2 (%arg6, %arg7): same low | (high << 32) reconstruction.
%5 = arith.extui %arg6 : i32 to i64 | |
%6 = arith.extui %arg7 : i32 to i64 | |
%7 = arith.shli %6, %c32_i64 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index | |
// Pair 3 (%arg8, %arg9).
%10 = arith.extui %arg8 : i32 to i64 | |
%11 = arith.extui %arg9 : i32 to i64 | |
%12 = arith.shli %11, %c32_i64 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index | |
// Pair 4 (%arg10, %arg11).
%15 = arith.extui %arg10 : i32 to i64 | |
%16 = arith.extui %arg11 : i32 to i64 | |
%17 = arith.shli %16, %c32_i64 : i64 | |
%18 = arith.ori %15, %17 : i64 | |
%19 = arith.index_castui %18 {stream.values = [0 : index]} : i64 to index | |
// Range assumption on the four rebuilt values: each has umin = umax = 0, i.e. each is asserted to be zero.
%20:4 = util.assume.int | |
%4<umin = 0, umax = 0>, | |
%9<umin = 0, umax = 0>, | |
%14<umin = 0, umax = 0>, | |
%19<umin = 0, umax = 0> | |
: index, index, index, index | |
// The assumed values are the offsets at which each binding is viewed: %arg0..%arg2 as read-only tensors, %arg3 as a write-only tensor.
%21 = stream.binding.subspan %arg0[%20#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%22 = stream.binding.subspan %arg1[%20#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%23 = stream.binding.subspan %arg2[%20#2] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%24 = stream.binding.subspan %arg3[%20#3] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Load each input in full: zero offsets, sizes [20, 4096, 64], unit strides.
%25 = flow.dispatch.tensor.load %21, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%26 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%27 = flow.dispatch.tensor.load %23, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%28 = tensor.empty() : tensor<20x4096x64xf16> | |
// Attention over the three loaded tensors plus the scalar %cst, with %28 as the output operand.
// NOTE(review): the operands are presumably query, key, value and scale in that order; confirm against the op definition.
// The region takes a single f32 block argument and yields it unchanged.
%29 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%25, %26, %27, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%28 : tensor<20x4096x64xf16>) { | |
^bb0(%arg12: f32): | |
iree_linalg_ext.yield %arg12 : f32 | |
} -> tensor<20x4096x64xf16> | |
// Write the complete result tensor into the write-only binding.
flow.dispatch.tensor.store %29, %24, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Host-side entry point @attention as printed in the dump after FoldGlobalsPass.
// Takes three !hal.buffer_view inputs and returns one !hal.buffer_view; the reflection attribute declares it as a
// sync function over tensor<20x4096x64xf16> inputs and output.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// Single i32 zero reused for all eight dispatch operands below.
%c0_i32 = arith.constant 0 : i32 | |
// Size operand for every stream resource below; 10485760 equals 20 * 4096 * 64 f16 elements at 2 bytes each.
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// Validate each input (shape [20, 4096, 64], f16, dense row-major) and import it as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Allocate the uninitialized output resource; %result_timepoint represents the allocation.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
// Command region: awaits the allocation timepoint and captures the three inputs and the result resource.
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// One dispatch of the attention executable with eight i32 zero operands; the dispatch function in this module
// recombines them pairwise into four 64-bit values that it uses as binding offsets.
// Bindings: three read-only inputs and one write-only output, each covering the whole resource from offset %c0.
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Await completion of the command region, then export the result resource as the returned buffer view.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
// Attribute aliases used by the module that follows.
// Executable target: "llvm-cpu" backend, "embedded-elf-x86_64" format, generic CPU with no extra CPU features and native_vector_size = 16.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Affine maps over a five-dimensional space (d0..d4).
// NOTE(review): presumably the indexing_maps of the attention op, as in the earlier dumps in this file; that op falls outside the visible part of this dump.
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
// #map3 has an empty result list.
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
// "local" device target whose only executable target is the alias defined at the top of this group.
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
// Module as printed after FuseGlobalsPass: a device global, one stream executable holding the attention dispatch, and the public @attention entry point.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_local alias; the affinity attributes below name it.
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
// Export whose workgroups() region takes no operands; the count is produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: four bindings (%arg0-%arg3) followed by eight i32 operands (%arg4-%arg11).
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c32_i64 = arith.constant 32 : i64 | |
// Each consecutive pair of i32 operands is zero-extended and combined as lo | (hi << 32) into one i64, then cast to index.
// This yields four index values: %4, %9, %14 and %19, each annotated with stream.values = [0].
%0 = arith.extui %arg4 : i32 to i64 | |
%1 = arith.extui %arg5 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index | |
%5 = arith.extui %arg6 : i32 to i64 | |
%6 = arith.extui %arg7 : i32 to i64 | |
%7 = arith.shli %6, %c32_i64 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index | |
%10 = arith.extui %arg8 : i32 to i64 | |
%11 = arith.extui %arg9 : i32 to i64 | |
%12 = arith.shli %11, %c32_i64 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index | |
%15 = arith.extui %arg10 : i32 to i64 | |
%16 = arith.extui %arg11 : i32 to i64 | |
%17 = arith.shli %16, %c32_i64 : i64 | |
%18 = arith.ori %15, %17 : i64 | |
%19 = arith.index_castui %18 {stream.values = [0 : index]} : i64 to index | |
// Records umin = 0 and umax = 0 for all four index values.
%20:4 = util.assume.int | |
%4<umin = 0, umax = 0>, | |
%9<umin = 0, umax = 0>, | |
%14<umin = 0, umax = 0>, | |
%19<umin = 0, umax = 0> | |
: index, index, index, index | |
// One subspan per binding, indexed by the matching assumed value: %arg0-%arg2 become readonly tensors, %arg3 the writeonly result.
%21 = stream.binding.subspan %arg0[%20#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%22 = stream.binding.subspan %arg1[%20#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%23 = stream.binding.subspan %arg2[%20#2] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%24 = stream.binding.subspan %arg3[%20#3] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Load the three inputs in full (offsets 0, sizes 20x4096x64, unit strides).
%25 = flow.dispatch.tensor.load %21, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%26 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%27 = flow.dispatch.tensor.load %23, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%28 = tensor.empty() : tensor<20x4096x64xf16> | |
// Attention over the three loaded tensors and the f16 constant 0.125, with %28 as the outs operand; the region yields its f32 argument unchanged.
// NOTE(review): 0.125 equals 1/sqrt(64), so %cst is presumably the attention scale - confirm.
%29 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%25, %26, %27, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%28 : tensor<20x4096x64xf16>) { | |
^bb0(%arg12: f32): | |
iree_linalg_ext.yield %arg12 : f32 | |
} -> tensor<20x4096x64xf16> | |
// Store the whole result tensor through the writeonly subspan %24.
flow.dispatch.tensor.store %29, %24, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Public entry point: takes three !hal.buffer_view arguments and returns one; the reflection attribute declares it as a sync function over 20x4096x64 f16 tensors.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// Each input is asserted to have shape [20, 4096, 64], element type f16 and dense row-major encoding, then imported as an external stream resource sized by %c10485760 (which equals 20 * 4096 * 64 * 2).
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Allocate the uninitialized result resource; the execute region below awaits the alloca timepoint.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// Single dispatch: eight %c0_i32 operands, three read-only resources and one write-only result, each over the range [%c0 for %c10485760].
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Wait on the execute timepoint, then export the result resource as a !hal.buffer_view.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
// Attribute aliases referenced by the module that follows.
// Executable target: "llvm-cpu" backend, "embedded-elf-x86_64" format, cpu = "generic", native_vector_size = 16.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Five maps over the iteration space (d0..d4), listed in this order as indexing_maps of the attention op in the module.
// NOTE(review): #map3 has no result dims; it presumably pairs with the scalar f16 operand of that op - confirm.
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
// "local" device target built from the executable target above; initial value of the @__device_0 global.
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
// Module as printed after IPOPass. It appears textually identical to the module in the preceding FuseGlobalsPass dump.
// The dispatch function still takes eight i32 operands and the call site still passes eight %c0_i32 values.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
// Workgroup count region: no operands, result taken from flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body with bindings %arg0-%arg3 and i32 operands %arg4-%arg11.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c32_i64 = arith.constant 32 : i64 | |
// Four repetitions of the same pattern: zero-extend two i32 operands, shift the second left by 32, OR them, cast the i64 to index.
%0 = arith.extui %arg4 : i32 to i64 | |
%1 = arith.extui %arg5 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index | |
%5 = arith.extui %arg6 : i32 to i64 | |
%6 = arith.extui %arg7 : i32 to i64 | |
%7 = arith.shli %6, %c32_i64 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index | |
%10 = arith.extui %arg8 : i32 to i64 | |
%11 = arith.extui %arg9 : i32 to i64 | |
%12 = arith.shli %11, %c32_i64 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index | |
%15 = arith.extui %arg10 : i32 to i64 | |
%16 = arith.extui %arg11 : i32 to i64 | |
%17 = arith.shli %16, %c32_i64 : i64 | |
%18 = arith.ori %15, %17 : i64 | |
%19 = arith.index_castui %18 {stream.values = [0 : index]} : i64 to index | |
// Range assumption on the four reconstructed values: umin = umax = 0 for each.
%20:4 = util.assume.int | |
%4<umin = 0, umax = 0>, | |
%9<umin = 0, umax = 0>, | |
%14<umin = 0, umax = 0>, | |
%19<umin = 0, umax = 0> | |
: index, index, index, index | |
// Bindings are turned into dispatch tensors: three readonly inputs and one writeonly output, all tensor<20x4096x64xf16>.
%21 = stream.binding.subspan %arg0[%20#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%22 = stream.binding.subspan %arg1[%20#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%23 = stream.binding.subspan %arg2[%20#2] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%24 = stream.binding.subspan %arg3[%20#3] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Full-tensor loads of the three inputs.
%25 = flow.dispatch.tensor.load %21, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%26 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%27 = flow.dispatch.tensor.load %23, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%28 = tensor.empty() : tensor<20x4096x64xf16> | |
// The attention op consumes %25, %26, %27 and the f16 constant %cst (0.125); its region simply yields the incoming f32 value.
%29 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%25, %26, %27, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%28 : tensor<20x4096x64xf16>) { | |
^bb0(%arg12: f32): | |
iree_linalg_ext.yield %arg12 : f32 | |
} -> tensor<20x4096x64xf16> | |
// Full-tensor store of the result into %24.
flow.dispatch.tensor.store %29, %24, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Host-side entry point: validates and imports three buffer views, runs the dispatch, and exports the result buffer view.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// Assert-then-import for input0, input1 and input2; every imported resource is external and sized by %c10485760.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Result storage is allocated uninitialized; its timepoint gates the execute region.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// All eight dispatch operands are the same constant %c0_i32.
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32, i32, i32) { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Await the execute timepoint and export %result as the returned buffer view.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FoldUniformOperandsPass (iree-stream-fold-uniform-operands) //----- // | |
// Attribute aliases referenced by the module that follows.
// Executable target for the "llvm-cpu" backend ("embedded-elf-x86_64", generic CPU, native_vector_size = 16).
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
// Indexing maps over (d0..d4) used, in this order, by the attention op's indexing_maps attribute in the module.
// NOTE(review): #map3 maps to (); presumably it belongs to the scalar f16 operand - confirm against the op definition.
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
// "local" device target containing the executable target above; used to initialize @__device_0.
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
// Module as printed after FoldUniformOperandsPass.
// Relative to the preceding IPOPass dump, the dispatch function no longer has i32 parameters: it defines %c0_i32 locally and uses it wherever the old operands appeared.
// The stream.cmd.dispatch call site correspondingly passes no operands.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
// Workgroup count region: no operands, result taken from flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: only the four bindings remain as parameters.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) { | |
// Local zero constant that feeds every arith.extui below.
%c0_i32 = arith.constant 0 : i32 | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c32_i64 = arith.constant 32 : i64 | |
// The lo | (hi << 32) reconstruction is still present, now with %c0_i32 as both halves of each of the four values.
%0 = arith.extui %c0_i32 : i32 to i64 | |
%1 = arith.extui %c0_i32 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index | |
%5 = arith.extui %c0_i32 : i32 to i64 | |
%6 = arith.extui %c0_i32 : i32 to i64 | |
%7 = arith.shli %6, %c32_i64 : i64 | |
%8 = arith.ori %5, %7 : i64 | |
%9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index | |
%10 = arith.extui %c0_i32 : i32 to i64 | |
%11 = arith.extui %c0_i32 : i32 to i64 | |
%12 = arith.shli %11, %c32_i64 : i64 | |
%13 = arith.ori %10, %12 : i64 | |
%14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index | |
%15 = arith.extui %c0_i32 : i32 to i64 | |
%16 = arith.extui %c0_i32 : i32 to i64 | |
%17 = arith.shli %16, %c32_i64 : i64 | |
%18 = arith.ori %15, %17 : i64 | |
%19 = arith.index_castui %18 {stream.values = [0 : index]} : i64 to index | |
// Range assumption umin = umax = 0 on each of the four values.
%20:4 = util.assume.int | |
%4<umin = 0, umax = 0>, | |
%9<umin = 0, umax = 0>, | |
%14<umin = 0, umax = 0>, | |
%19<umin = 0, umax = 0> | |
: index, index, index, index | |
// Subspans of the four bindings: three readonly inputs, one writeonly output.
%21 = stream.binding.subspan %arg0[%20#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%22 = stream.binding.subspan %arg1[%20#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%23 = stream.binding.subspan %arg2[%20#2] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%24 = stream.binding.subspan %arg3[%20#3] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Full-tensor loads of the three inputs.
%25 = flow.dispatch.tensor.load %21, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%26 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%27 = flow.dispatch.tensor.load %23, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%28 = tensor.empty() : tensor<20x4096x64xf16> | |
// Attention op; the region block argument is now named %arg4 because the function has only four parameters.
%29 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%25, %26, %27, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%28 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
// Full-tensor store of the result into %24.
flow.dispatch.tensor.store %29, %24, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// Host-side entry point: validates and imports three buffer views, runs the dispatch, and exports the result buffer view.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
// %c0_i32 is still defined here but has no remaining use in this function at this point.
%c0_i32 = arith.constant 0 : i32 | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// Assert-then-import for input0, input1 and input2; every imported resource is external and sized by %c10485760.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Result storage is allocated uninitialized; its timepoint gates the execute region.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// The dispatch now lists only its resource bindings; there is no operand list.
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Await the execute timepoint and export %result as the returned buffer view.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// @attention as printed after the Canonicalizer; this dump section contains only the function, not the enclosing module.
// Relative to the preceding FoldUniformOperandsPass dump, the %c0_i32 constant is no longer present.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// Each input is asserted to have shape [20, 4096, 64], element type f16 and dense row-major encoding, then imported as an external stream resource sized by %c10485760.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Allocate the uninitialized result resource; the execute region awaits the alloca timepoint.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// Operand-free dispatch with three read-only resources and one write-only result, each over [%c0 for %c10485760].
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Wait on the execute timepoint, then export the result resource as a !hal.buffer_view.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// @attention as printed after CSE. It appears textually identical to the function in the preceding Canonicalizer dump.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// Validate (shape [20, 4096, 64], f16, dense row-major) and import each of the three inputs as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Uninitialized result allocation whose timepoint the execute region awaits.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// One dispatch: inputs bound read-only (ro), result bound write-only (wo).
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Await completion and export the result as a buffer view.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
// @attention as printed after OptimizeIntArithmeticPass. It appears textually identical to the function in the preceding CSE dump.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// Three assert/import pairs: each buffer view is checked against shape [20, 4096, 64], f16, dense row-major before being imported.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// The result resource is allocated uninitialized and produces the timepoint awaited by stream.cmd.execute.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// Dispatch of the attention executable export over the four captured resources.
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// The awaited result is exported and returned.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
// @attention as printed after SimplifyGlobalAccessesPass. It appears textually identical to the function in the preceding OptimizeIntArithmeticPass dump.
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
// Input checks and imports: shape [20, 4096, 64], element type f16, dense row-major; resources are external and sized by %c10485760.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
// Result allocation (uninitialized) and the timepoint that gates execution.
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
// The only command in the region: the attention dispatch with ro/ro/ro/wo resource access.
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
// Await, export, return.
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
stream.executable private @attention_dispatch_0 { | |
stream.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After MaterializeInterfacesPass (iree-hal-materialize-interfaces) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
hal.executable private @attention_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { | |
hal.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@embedded_elf_x86_64::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After PruneExecutablesPass (iree-hal-prune-executables) //----- // | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)> | |
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_local | |
hal.executable private @attention_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) { | |
hal.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
} | |
util.func public @attention(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @attention(%input0: tensor<20x4096x64xf16>, %input1: tensor<20x4096x64xf16>, %input2: tensor<20x4096x64xf16>) -> (%output0: tensor<20x4096x64xf16>)"}} { | |
%c10485760 = arith.constant 10485760 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c20 = arith.constant 20 : index | |
%element_type_f16 = hal.element_type<f16> : i32 | |
%dense_row_major = hal.encoding_type<dense_row_major> : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%0 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%1 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%c20, %c4096, %c64]) type(%element_type_f16) encoding(%dense_row_major) | |
%2 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg2 : !hal.buffer_view -> tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c10485760} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%0 as %arg3: !stream.resource<external>{%c10485760}, %1 as %arg4: !stream.resource<external>{%c10485760}, %2 as %arg5: !stream.resource<external>{%c10485760}, %result as %arg6: !stream.resource<external>{%c10485760}) { | |
stream.cmd.dispatch @attention_dispatch_0::@embedded_elf_x86_64::@attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store { | |
ro %arg3[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg4[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
ro %arg5[%c0 for %c10485760] : !stream.resource<external>{%c10485760}, | |
wo %arg6[%c0 for %c10485760] : !stream.resource<external>{%c10485760} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %result : !stream.resource<external>{%c10485760} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<20x4096x64xf16> in !stream.resource<external>{%c10485760} -> !hal.buffer_view | |
util.return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After TypePropagationPass (iree-codegen-type-propagation) //----- // | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
// -----// IR Dump After BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- // | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- // | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
// -----// IR Dump After DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- // | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
// -----// IR Dump After MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- // | |
module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
// -----// IR Dump After RematerializeParallelOpsPass (iree-codegen-rematerialize-parallel-ops) //----- // | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
// -----// IR Dump After ExpandF16OpToF32Pass (iree-llvmcpu-expand-f16-op-to-f32) //----- // | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
// -----// IR Dump After MaterializeDeviceEncodingPass (iree-codegen-materialize-device-encoding) //----- // | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
// -----// IR Dump After EraseHALDescriptorTypeFromMemRefPass (iree-codegen-erase-hal-descriptor-type-from-memref) //----- // | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
// -----// IR Dump After LLVMCPUSelectLoweringStrategyPass (iree-llvmcpu-select-lowering-strategy) //----- // | |
module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPULinalgExtTileAndVectorize>} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>], lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 0, 0, 64], [1, 1, 0, 0, 8], [0, 0, 0, 8, 0]]>} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
// -----// IR Dump After ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- // | |
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { | |
hal.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPULinalgExtTileAndVectorize>} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>], lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 0, 0, 64], [1, 1, 0, 0, 8], [0, 0, 0, 8, 0]]>} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
// -----// IR Dump After ConfigureExecutablesPass (iree-hal-configure-executables) //----- // | |
hal.executable private @attention_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { | |
hal.executable.export public @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPULinalgExtTileAndVectorize>} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>], lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 0, 0, 64], [1, 1, 0, 0, 8], [0, 0, 0, 8, 0]]>} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
} | |
} | |
// -----// IR Dump After LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- // | |
module { | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPULinalgExtTileAndVectorize>} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>], lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 0, 0, 64], [1, 1, 0, 0, 8], [0, 0, 0, 8, 0]]>} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<20x4096x64xf16> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
} | |
// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- // | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPULinalgExtTileAndVectorize>} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = scf.forall (%arg0, %arg1) = (0, 0) to (20, 4096) step (1, 64) shared_outs(%arg2 = %7) -> (tensor<20x4096x64xf16>) { | |
%extracted_slice = tensor.extract_slice %4[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x64x64xf16> | |
%extracted_slice_0 = tensor.extract_slice %5[%arg0, 0, 0] [1, 4096, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x4096x64xf16> | |
%extracted_slice_1 = tensor.extract_slice %6[%arg0, 0, 0] [1, 4096, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x4096x64xf16> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x64x64xf16> | |
%9 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>], lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 0, 0, 64], [1, 1, 0, 0, 8], [0, 0, 0, 8, 0]]>} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1, %cst : tensor<1x64x64xf16>, tensor<1x4096x64xf16>, tensor<1x4096x64xf16>, f16) outs(%extracted_slice_2 : tensor<1x64x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<1x64x64xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1, %c0] [1, 64, 64] [1, 1, 1] : tensor<1x64x64xf16> into tensor<20x4096x64xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPULinalgExtTileAndVectorize>} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = scf.forall (%arg0, %arg1) = (0, 0) to (20, 4096) step (1, 64) shared_outs(%arg2 = %7) -> (tensor<20x4096x64xf16>) { | |
%extracted_slice = tensor.extract_slice %4[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x64x64xf16> | |
%extracted_slice_0 = tensor.extract_slice %5[%arg0, 0, 0] [1, 4096, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x4096x64xf16> | |
%extracted_slice_1 = tensor.extract_slice %6[%arg0, 0, 0] [1, 4096, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x4096x64xf16> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x64x64xf16> | |
%9 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>], lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 0, 0, 64], [1, 1, 0, 0, 8], [0, 0, 0, 8, 0]]>} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1, %cst : tensor<1x64x64xf16>, tensor<1x4096x64xf16>, tensor<1x4096x64xf16>, f16) outs(%extracted_slice_2 : tensor<1x64x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<1x64x64xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<1x64x64xf16> into tensor<20x4096x64xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPULinalgExtTileAndVectorize>} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = scf.forall (%arg0, %arg1) = (0, 0) to (20, 4096) step (1, 64) shared_outs(%arg2 = %7) -> (tensor<20x4096x64xf16>) { | |
%extracted_slice = tensor.extract_slice %4[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x64x64xf16> | |
%extracted_slice_0 = tensor.extract_slice %5[%arg0, 0, 0] [1, 4096, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x4096x64xf16> | |
%extracted_slice_1 = tensor.extract_slice %6[%arg0, 0, 0] [1, 4096, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x4096x64xf16> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x64x64xf16> | |
%9 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>], lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 0, 0, 64], [1, 1, 0, 0, 8], [0, 0, 0, 8, 0]]>} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1, %cst : tensor<1x64x64xf16>, tensor<1x4096x64xf16>, tensor<1x4096x64xf16>, f16) outs(%extracted_slice_2 : tensor<1x64x64xf16>) { | |
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<1x64x64xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<1x64x64xf16> into tensor<20x4096x64xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- // | |
// Attention dispatch over 20x4096x64 f16 tensors. Bindings 0-2 are read-only
// inputs and binding 3 is the write-only result. An scf.forall splits the work
// into 1x64x64 output tiles, each produced by one iree_linalg_ext.attention op.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPULinalgExtTileAndVectorize>} { | |
// Scalar fed to attention as its fourth input (the one with indexing map -> ()).
// NOTE(review): presumably the softmax scale, 0.125 = 1/sqrt(64) — confirm against the op definition.
%cst = arith.constant 1.250000e-01 : f16 | |
// Offset operand shared by all four binding subspans below.
%c0 = arith.constant 0 : index | |
// Four storage-buffer bindings, each with alignment(64) and offset %c0.
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Load each read-only binding in full (offsets 0, sizes 20x4096x64, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
// Destination tensor for the result; passed to the forall as its shared output.
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// %arg0 walks dim 0 (0..20, step 1) and %arg1 walks dim 1 (0..4096, step 64).
// The mapping attribute on the closing brace assigns them to workgroup y and x.
%8 = scf.forall (%arg0, %arg1) = (0, 0) to (20, 4096) step (1, 64) shared_outs(%arg2 = %7) -> (tensor<20x4096x64xf16>) { | |
// 1x64x64 tile of the first input at [%arg0, %arg1, 0].
%extracted_slice = tensor.extract_slice %4[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x64x64xf16> | |
// Full 1x4096x64 slabs of the second and third inputs at index %arg0.
%extracted_slice_0 = tensor.extract_slice %5[%arg0, 0, 0] [1, 4096, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x4096x64xf16> | |
%extracted_slice_1 = tensor.extract_slice %6[%arg0, 0, 0] [1, 4096, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x4096x64xf16> | |
// Matching 1x64x64 tile of the shared output, used as the attention destination.
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x64x64xf16> | |
// Attention on the tile. lowering_config lists three tile-size vectors over (d0..d4);
// the forall steps (1, 64) agree with the d0/d1 entries of the first vector.
%9 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>], lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 0, 0, 64], [1, 1, 0, 0, 8], [0, 0, 0, 8, 0]]>} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1, %cst : tensor<1x64x64xf16>, tensor<1x4096x64xf16>, tensor<1x4096x64xf16>, f16) outs(%extracted_slice_2 : tensor<1x64x64xf16>) { | |
// The region yields its f32 argument unchanged.
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<1x64x64xf16> | |
// Write the computed tile back into the shared output at the same offsets.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<1x64x64xf16> into tensor<20x4096x64xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Store the assembled 20x4096x64 result to the write-only binding %3.
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- // | |
// Attention dispatch on 20x4096x64 f16 operands: three read-only bindings in,
// one write-only binding out. This function contains no tensor.pad op; the body
// is an scf.forall over 1x64x64 output tiles around one attention op.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPULinalgExtTileAndVectorize>} { | |
// Fourth attention input (scalar, indexing map -> ()).
// NOTE(review): looks like the attention scale (0.125) — verify against the op definition.
%cst = arith.constant 1.250000e-01 : f16 | |
// Used as the offset of every binding subspan.
%c0 = arith.constant 0 : index | |
// Bindings 0, 1, 2: read-only tensors. Binding 3: write-only tensor.
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Whole-tensor loads of the three inputs.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
// Initial value of the forall's shared output %arg2.
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// One iteration per (dim-0 index, 64-row block of dim 1): 20 x (4096 / 64) iterations,
// mapped to workgroup y and x by the trailing mapping attribute.
%8 = scf.forall (%arg0, %arg1) = (0, 0) to (20, 4096) step (1, 64) shared_outs(%arg2 = %7) -> (tensor<20x4096x64xf16>) { | |
// First input: only the 64 rows starting at %arg1 are needed.
%extracted_slice = tensor.extract_slice %4[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x64x64xf16> | |
// Second and third inputs: all 4096 rows at index %arg0 (they are not tiled along dim 1).
%extracted_slice_0 = tensor.extract_slice %5[%arg0, 0, 0] [1, 4096, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x4096x64xf16> | |
%extracted_slice_1 = tensor.extract_slice %6[%arg0, 0, 0] [1, 4096, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x4096x64xf16> | |
// Output tile taken from the shared output at the same offsets as the first input tile.
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x64x64xf16> | |
// Tile-level attention; still carries the full three-level tile_sizes lowering_config.
%9 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>], lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 0, 0, 64], [1, 1, 0, 0, 8], [0, 0, 0, 8, 0]]>} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1, %cst : tensor<1x64x64xf16>, tensor<1x4096x64xf16>, tensor<1x4096x64xf16>, f16) outs(%extracted_slice_2 : tensor<1x64x64xf16>) { | |
// Identity region: yields the incoming f32 value.
^bb0(%arg3: f32): | |
iree_linalg_ext.yield %arg3 : f32 | |
} -> tensor<1x64x64xf16> | |
// Publish the tile into the shared output.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<1x64x64xf16> into tensor<20x4096x64xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Write the forall result to output binding %3.
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
// -----// IR Dump After LLVMCPUTilePass (iree-llvmcpu-tile) //----- // | |
// Attention dispatch in which each 1x64x64 workgroup tile is further tiled by two
// nested scf.for loops, so the attention op now produces a 1x1x8 slice per inner
// iteration. Loop steps 1 (d1) and 8 (d4) agree with the second tile_sizes vector
// [1, 1, 0, 0, 8] in the op's lowering_config.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPULinalgExtTileAndVectorize>} { | |
// Loop bounds and steps for the two scf.for loops below.
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c1 = arith.constant 1 : index | |
// Scalar fourth input of the attention op.
// NOTE(review): presumably the softmax scale (0.125) — confirm against the op definition.
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
// Bindings 0-2 are read-only inputs; binding 3 is the write-only output.
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Load all three inputs in full.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// Workgroup-level distribution: one iteration per 1x64x64 output tile.
%8 = scf.forall (%arg0, %arg1) = (0, 0) to (20, 4096) step (1, 64) shared_outs(%arg2 = %7) -> (tensor<20x4096x64xf16>) { | |
%extracted_slice = tensor.extract_slice %4[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x64x64xf16> | |
%extracted_slice_0 = tensor.extract_slice %5[%arg0, 0, 0] [1, 4096, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x4096x64xf16> | |
%extracted_slice_1 = tensor.extract_slice %6[%arg0, 0, 0] [1, 4096, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x4096x64xf16> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x64x64xf16> | |
// Outer loop: %arg3 selects one of the 64 rows of the tile (step 1).
// The 1x64x64 output tile is threaded through as iter_arg %arg4.
%9 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %extracted_slice_2) -> (tensor<1x64x64xf16>) { | |
// Inner loop: %arg5 selects an 8-wide column block of the last dimension (step 8).
%10 = scf.for %arg5 = %c0 to %c64 step %c8 iter_args(%arg6 = %arg4) -> (tensor<1x64x64xf16>) { | |
// Single row %arg3 of the first-input tile (independent of %arg5).
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg3, 0] [1, 1, 64] [1, 1, 1] : tensor<1x64x64xf16> to tensor<1x1x64xf16> | |
// 8 columns of the third input starting at %arg5, all 4096 rows.
%extracted_slice_4 = tensor.extract_slice %extracted_slice_1[0, 0, %arg5] [1, 4096, 8] [1, 1, 1] : tensor<1x4096x64xf16> to tensor<1x4096x8xf16> | |
// 1x1x8 destination slice of the running output tile.
%extracted_slice_5 = tensor.extract_slice %arg6[0, %arg3, %arg5] [1, 1, 8] [1, 1, 1] : tensor<1x64x64xf16> to tensor<1x1x8xf16> | |
// Attention on the small slices; the second input is still the full 1x4096x64 slab.
%11 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>], lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 0, 0, 64], [1, 1, 0, 0, 8], [0, 0, 0, 8, 0]]>} ins(%extracted_slice_3, %extracted_slice_0, %extracted_slice_4, %cst : tensor<1x1x64xf16>, tensor<1x4096x64xf16>, tensor<1x4096x8xf16>, f16) outs(%extracted_slice_5 : tensor<1x1x8xf16>) { | |
^bb0(%arg7: f32): | |
iree_linalg_ext.yield %arg7 : f32 | |
} -> tensor<1x1x8xf16> | |
// Insert the 1x1x8 result into the running tile and carry it to the next iteration.
%inserted_slice = tensor.insert_slice %11 into %arg6[0, %arg3, %arg5] [1, 1, 8] [1, 1, 1] : tensor<1x1x8xf16> into tensor<1x64x64xf16> | |
scf.yield %inserted_slice : tensor<1x64x64xf16> | |
} | |
scf.yield %10 : tensor<1x64x64xf16> | |
} | |
// Publish the completed 1x64x64 tile into the shared output.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<1x64x64xf16> into tensor<20x4096x64xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Store the full result to the write-only binding %3.
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
// -----// IR Dump After ConvertAttentionToOnlineAttentionPass (iree-linalg-ext-convert-attention-to-online-attention) //----- // | |
// Attention dispatch in which the innermost computation is an
// iree_linalg_ext.online_attention op with three f32 accumulator outputs,
// followed by a linalg.generic that divides the first accumulator by the third
// and truncates the result to f16.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPULinalgExtTileAndVectorize>} { | |
// Loop bounds and steps for the two scf.for loops below.
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c1 = arith.constant 1 : index | |
// Scalar fourth input of online_attention.
// NOTE(review): presumably the softmax scale (0.125) — confirm against the op definition.
%cst = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
// Bindings 0-2 are read-only inputs; binding 3 is the write-only output.
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Load all three inputs in full.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// Workgroup-level distribution: one iteration per 1x64x64 output tile.
%8 = scf.forall (%arg0, %arg1) = (0, 0) to (20, 4096) step (1, 64) shared_outs(%arg2 = %7) -> (tensor<20x4096x64xf16>) { | |
%extracted_slice = tensor.extract_slice %4[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x64x64xf16> | |
%extracted_slice_0 = tensor.extract_slice %5[%arg0, 0, 0] [1, 4096, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x4096x64xf16> | |
%extracted_slice_1 = tensor.extract_slice %6[%arg0, 0, 0] [1, 4096, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x4096x64xf16> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x64x64xf16> | |
// Row loop (%arg3, step 1) and column-block loop (%arg5, step 8) over the tile.
%9 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %extracted_slice_2) -> (tensor<1x64x64xf16>) { | |
%10 = scf.for %arg5 = %c0 to %c64 step %c8 iter_args(%arg6 = %arg4) -> (tensor<1x64x64xf16>) { | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg3, 0] [1, 1, 64] [1, 1, 1] : tensor<1x64x64xf16> to tensor<1x1x64xf16> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_1[0, 0, %arg5] [1, 4096, 8] [1, 1, 1] : tensor<1x4096x64xf16> to tensor<1x4096x8xf16> | |
// f16 destination slice; consumed by the linalg.generic below, not by online_attention.
%extracted_slice_5 = tensor.extract_slice %arg6[0, %arg3, %arg5] [1, 1, 8] [1, 1, 1] : tensor<1x64x64xf16> to tensor<1x1x8xf16> | |
// f32 buffers for the three online_attention outputs: one 1x1x8 and two 1x1.
%11 = tensor.empty() : tensor<1x1x8xf32> | |
%12 = tensor.empty() : tensor<1x1xf32> | |
// Initial values: 0.0 for the 1x1x8 output, -3.40282347E+38 (the most negative
// finite f32) for the second output, 0.0 for the third.
// NOTE(review): the second and third outputs look like the running max and running
// sum of an online softmax — confirm against the online_attention op definition.
%cst_6 = arith.constant 0.000000e+00 : f32 | |
%cst_7 = arith.constant -3.40282347E+38 : f32 | |
%cst_8 = arith.constant 0.000000e+00 : f32 | |
%13 = linalg.fill ins(%cst_6 : f32) outs(%11 : tensor<1x1x8xf32>) -> tensor<1x1x8xf32> | |
// Both 1x1 fills take %12 as destination and return separate result tensors.
%14 = linalg.fill ins(%cst_7 : f32) outs(%12 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
%15 = linalg.fill ins(%cst_8 : f32) outs(%12 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
// online_attention has the same four inputs as the attention op it replaced, plus two
// extra (d0, d1) indexing maps for the additional 1x1 outputs. All outputs are f32.
%16:3 = iree_linalg_ext.online_attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1)>], lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 0, 0, 64], [1, 1, 0, 0, 8], [0, 0, 0, 8, 0]]>} ins(%extracted_slice_3, %extracted_slice_0, %extracted_slice_4, %cst : tensor<1x1x64xf16>, tensor<1x4096x64xf16>, tensor<1x4096x8xf16>, f16) outs(%13, %14, %15 : tensor<1x1x8xf32>, tensor<1x1xf32>, tensor<1x1xf32>) { | |
^bb0(%arg7: f32): | |
iree_linalg_ext.yield %arg7 : f32 | |
} -> tensor<1x1x8xf32>, tensor<1x1xf32>, tensor<1x1xf32> | |
// Elementwise finalization: out = truncf((1.0 / %16#2) * %16#0), with the 1x1 operand
// broadcast along the last dimension by its (d0, d1) indexing map.
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%16#2, %16#0 : tensor<1x1xf32>, tensor<1x1x8xf32>) outs(%extracted_slice_5 : tensor<1x1x8xf16>) { | |
// %in is the element of %16#2, %in_9 is the element of %16#0; %out is unused.
^bb0(%in: f32, %in_9: f32, %out: f16): | |
%cst_10 = arith.constant 1.000000e+00 : f32 | |
%18 = arith.divf %cst_10, %in : f32 | |
%19 = arith.mulf %18, %in_9 : f32 | |
%20 = arith.truncf %19 : f32 to f16 | |
linalg.yield %20 : f16 | |
} -> tensor<1x1x8xf16> | |
// Insert the finalized 1x1x8 f16 slice into the running tile.
%inserted_slice = tensor.insert_slice %17 into %arg6[0, %arg3, %arg5] [1, 1, 8] [1, 1, 1] : tensor<1x1x8xf16> into tensor<1x64x64xf16> | |
scf.yield %inserted_slice : tensor<1x64x64xf16> | |
} | |
scf.yield %10 : tensor<1x64x64xf16> | |
} | |
// Publish the completed 1x64x64 tile into the shared output.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<1x64x64xf16> into tensor<20x4096x64xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Store the full result to the write-only binding %3.
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
// -----// IR Dump After LLVMCPUTilePass (iree-llvmcpu-tile) //----- // | |
// Attention dispatch in which online_attention is wrapped in a third scf.for loop
// that walks the 4096-long dimension of the second and third inputs in steps of 8,
// carrying the three f32 accumulators as iter_args. The step of 8 agrees with the
// third tile_sizes vector [0, 0, 0, 8, 0] in the op's lowering_config.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPULinalgExtTileAndVectorize>} { | |
// Numerator of the final 1.0 / x division inside the linalg.generic.
%cst = arith.constant 1.000000e+00 : f32 | |
// Upper bound of the innermost (%arg7) loop.
%c4096 = arith.constant 4096 : index | |
// Accumulator initial values: the most negative finite f32, and 0.0.
%cst_0 = arith.constant -3.40282347E+38 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Loop bounds and steps for the scf.for loops.
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c1 = arith.constant 1 : index | |
// Scalar fourth input of online_attention.
// NOTE(review): presumably the softmax scale (0.125) — confirm against the op definition.
%cst_2 = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
// Bindings 0-2 are read-only inputs; binding 3 is the write-only output.
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Load all three inputs in full.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// Workgroup-level distribution: one iteration per 1x64x64 output tile.
%8 = scf.forall (%arg0, %arg1) = (0, 0) to (20, 4096) step (1, 64) shared_outs(%arg2 = %7) -> (tensor<20x4096x64xf16>) { | |
%extracted_slice = tensor.extract_slice %4[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x64x64xf16> | |
%extracted_slice_3 = tensor.extract_slice %5[%arg0, 0, 0] [1, 4096, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x4096x64xf16> | |
%extracted_slice_4 = tensor.extract_slice %6[%arg0, 0, 0] [1, 4096, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x4096x64xf16> | |
%extracted_slice_5 = tensor.extract_slice %arg2[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x64x64xf16> | |
// Row loop (%arg3, step 1) and column-block loop (%arg5, step 8) over the tile.
%9 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %extracted_slice_5) -> (tensor<1x64x64xf16>) { | |
%10 = scf.for %arg5 = %c0 to %c64 step %c8 iter_args(%arg6 = %arg4) -> (tensor<1x64x64xf16>) { | |
%extracted_slice_6 = tensor.extract_slice %extracted_slice[0, %arg3, 0] [1, 1, 64] [1, 1, 1] : tensor<1x64x64xf16> to tensor<1x1x64xf16> | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_4[0, 0, %arg5] [1, 4096, 8] [1, 1, 1] : tensor<1x4096x64xf16> to tensor<1x4096x8xf16> | |
// f16 destination slice for the linalg.generic that follows the reduction loop.
%extracted_slice_8 = tensor.extract_slice %arg6[0, %arg3, %arg5] [1, 1, 8] [1, 1, 1] : tensor<1x64x64xf16> to tensor<1x1x8xf16> | |
// Accumulator initialization: %13 = 0.0 (1x1x8), %14 = most negative finite f32 (1x1),
// %15 = 0.0 (1x1).
// NOTE(review): %14 / %15 look like the running max / running sum of an online
// softmax — confirm against the online_attention op definition.
%11 = tensor.empty() : tensor<1x1x8xf32> | |
%12 = tensor.empty() : tensor<1x1xf32> | |
%13 = linalg.fill ins(%cst_1 : f32) outs(%11 : tensor<1x1x8xf32>) -> tensor<1x1x8xf32> | |
%14 = linalg.fill ins(%cst_0 : f32) outs(%12 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
%15 = linalg.fill ins(%cst_1 : f32) outs(%12 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
// Innermost loop: %arg7 walks 0..4096 in steps of 8, threading the three accumulators
// (%arg8, %arg9, %arg10) from one iteration to the next.
%16:3 = scf.for %arg7 = %c0 to %c4096 step %c8 iter_args(%arg8 = %13, %arg9 = %14, %arg10 = %15) -> (tensor<1x1x8xf32>, tensor<1x1xf32>, tensor<1x1xf32>) { | |
// 8-row chunks of the second input (1x8x64) and of the third-input column block (1x8x8).
%extracted_slice_9 = tensor.extract_slice %extracted_slice_3[0, %arg7, 0] [1, 8, 64] [1, 1, 1] : tensor<1x4096x64xf16> to tensor<1x8x64xf16> | |
%extracted_slice_10 = tensor.extract_slice %extracted_slice_7[0, %arg7, 0] [1, 8, 8] [1, 1, 1] : tensor<1x4096x8xf16> to tensor<1x8x8xf16> | |
// online_attention on one chunk, using the carried accumulators as its outs operands.
%18:3 = iree_linalg_ext.online_attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1)>], lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 0, 0, 64], [1, 1, 0, 0, 8], [0, 0, 0, 8, 0]]>} ins(%extracted_slice_6, %extracted_slice_9, %extracted_slice_10, %cst_2 : tensor<1x1x64xf16>, tensor<1x8x64xf16>, tensor<1x8x8xf16>, f16) outs(%arg8, %arg9, %arg10 : tensor<1x1x8xf32>, tensor<1x1xf32>, tensor<1x1xf32>) { | |
^bb0(%arg11: f32): | |
iree_linalg_ext.yield %arg11 : f32 | |
} -> tensor<1x1x8xf32>, tensor<1x1xf32>, tensor<1x1xf32> | |
// Pass the updated accumulators to the next chunk.
scf.yield %18#0, %18#1, %18#2 : tensor<1x1x8xf32>, tensor<1x1xf32>, tensor<1x1xf32> | |
} | |
// Elementwise finalization after the loop: out = truncf((1.0 / %16#2) * %16#0), with the
// 1x1 operand broadcast along the last dimension by its (d0, d1) indexing map.
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%16#2, %16#0 : tensor<1x1xf32>, tensor<1x1x8xf32>) outs(%extracted_slice_8 : tensor<1x1x8xf16>) { | |
// %in is the element of %16#2, %in_9 is the element of %16#0; %out is unused.
^bb0(%in: f32, %in_9: f32, %out: f16): | |
%18 = arith.divf %cst, %in : f32 | |
%19 = arith.mulf %18, %in_9 : f32 | |
%20 = arith.truncf %19 : f32 to f16 | |
linalg.yield %20 : f16 | |
} -> tensor<1x1x8xf16> | |
// Insert the finalized 1x1x8 f16 slice into the running tile.
%inserted_slice = tensor.insert_slice %17 into %arg6[0, %arg3, %arg5] [1, 1, 8] [1, 1, 1] : tensor<1x1x8xf16> into tensor<1x64x64xf16> | |
scf.yield %inserted_slice : tensor<1x64x64xf16> | |
} | |
scf.yield %10 : tensor<1x64x64xf16> | |
} | |
// Publish the completed 1x64x64 tile into the shared output.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<1x64x64xf16> into tensor<20x4096x64xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Store the full result to the write-only binding %3.
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
// -----// IR Dump After DecomposeWinogradTransformPass (iree-linalg-ext-decompose-winograd) //----- // | |
// Dispatch entry point for a 20x4096x64 f16 attention computation.
// Bindings 0-2 are loaded as read-only 20x4096x64xf16 tensors and binding 3 is
// the write-only 20x4096x64xf16 result. The work is split by an scf.forall over
// (batch, 64-row tile); inside it the core computation is still expressed as a
// single iree_linalg_ext.online_attention op on 1x1x64 / 1x8x64 / 1x8x8 tiles.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPULinalgExtTileAndVectorize>} { | |
// 1.0: numerator of the reciprocal taken in the final normalization generic.
%cst = arith.constant 1.000000e+00 : f32 | |
%c4096 = arith.constant 4096 : index | |
// Most negative finite f32; initial value of the 1x1 accumulator passed as %arg9.
%cst_0 = arith.constant -3.40282347E+38 : f32 | |
// 0.0: initial value of the 1x1x8 accumulator (%arg8) and of the 1x1 accumulator %arg10.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c1 = arith.constant 1 : index | |
// Scalar fourth input of online_attention.
// NOTE(review): presumably the softmax scale (0.125 = 1/sqrt(64)) — confirm against the op definition.
%cst_2 = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
// Bindings 0..2: read-only inputs; binding 3: write-only output. All share one pipeline layout.
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Load the three inputs in full (offsets 0, full sizes, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// Workgroup-level tiling: %arg0 steps over dim 0 by 1, %arg1 over dim 1 by 64
// (mapped to workgroup y and x respectively, see the mapping attribute below).
%8 = scf.forall (%arg0, %arg1) = (0, 0) to (20, 4096) step (1, 64) shared_outs(%arg2 = %7) -> (tensor<20x4096x64xf16>) { | |
// 1x64x64 output tile owned by this forall iteration.
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x64x64xf16> | |
// %arg3: row within the 64-row tile (step 1); %arg5: output column offset (step 8).
%9 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %extracted_slice) -> (tensor<1x64x64xf16>) { | |
%10 = scf.for %arg5 = %c0 to %c64 step %c8 iter_args(%arg6 = %arg4) -> (tensor<1x64x64xf16>) { | |
// Absolute row index into the first input: tile-local row + tile offset.
%11 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg1] | |
// One full 64-element row of the first input.
%extracted_slice_3 = tensor.extract_slice %4[%arg0, %11, 0] [1, 1, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x1x64xf16> | |
// 1x1x8 destination tile; used only as the outs operand of the final generic.
%extracted_slice_4 = tensor.extract_slice %arg6[0, %arg3, %arg5] [1, 1, 8] [1, 1, 1] : tensor<1x64x64xf16> to tensor<1x1x8xf16> | |
// Initial accumulators for the reduction loop: %14 = zeros (1x1x8),
// %15 = most negative finite f32 (1x1), %16 = zeros (1x1).
%12 = tensor.empty() : tensor<1x1x8xf32> | |
%13 = tensor.empty() : tensor<1x1xf32> | |
%14 = linalg.fill ins(%cst_1 : f32) outs(%12 : tensor<1x1x8xf32>) -> tensor<1x1x8xf32> | |
%15 = linalg.fill ins(%cst_0 : f32) outs(%13 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
%16 = linalg.fill ins(%cst_1 : f32) outs(%13 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
// Reduction loop over dim 1 of the second and third inputs, 8 rows per step,
// carrying the three accumulators through iter_args.
%17:3 = scf.for %arg7 = %c0 to %c4096 step %c8 iter_args(%arg8 = %14, %arg9 = %15, %arg10 = %16) -> (tensor<1x1x8xf32>, tensor<1x1xf32>, tensor<1x1xf32>) { | |
// 8 rows x 64 columns of the second input; 8 rows x 8 columns (at %arg5) of the third.
%extracted_slice_5 = tensor.extract_slice %5[%arg0, %arg7, 0] [1, 8, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x8x64xf16> | |
%extracted_slice_6 = tensor.extract_slice %6[%arg0, %arg7, %arg5] [1, 8, 8] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x8x8xf16> | |
// One online-attention update step on the tiles above; returns the updated
// (1x1x8, 1x1, 1x1) accumulators. The region yields its argument unchanged.
// NOTE(review): operand roles are presumably query / key / value / scale and
// outputs accumulator / running max / running sum — confirm against the op docs.
%19:3 = iree_linalg_ext.online_attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> ()>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1)>], lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 0, 0, 64], [1, 1, 0, 0, 8], [0, 0, 0, 8, 0]]>} ins(%extracted_slice_3, %extracted_slice_5, %extracted_slice_6, %cst_2 : tensor<1x1x64xf16>, tensor<1x8x64xf16>, tensor<1x8x8xf16>, f16) outs(%arg8, %arg9, %arg10 : tensor<1x1x8xf32>, tensor<1x1xf32>, tensor<1x1xf32>) { | |
^bb0(%arg11: f32): | |
iree_linalg_ext.yield %arg11 : f32 | |
} -> tensor<1x1x8xf32>, tensor<1x1xf32>, tensor<1x1xf32> | |
scf.yield %19#0, %19#1, %19#2 : tensor<1x1x8xf32>, tensor<1x1xf32>, tensor<1x1xf32> | |
} | |
// Final normalization: out = truncf((1.0 / %17#2) * %17#0), with the 1x1 value
// %17#2 broadcast across the 8-wide last dimension.
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17#2, %17#0 : tensor<1x1xf32>, tensor<1x1x8xf32>) outs(%extracted_slice_4 : tensor<1x1x8xf16>) { | |
^bb0(%in: f32, %in_5: f32, %out: f16): | |
%19 = arith.divf %cst, %in : f32 | |
%20 = arith.mulf %19, %in_5 : f32 | |
%21 = arith.truncf %20 : f32 to f16 | |
linalg.yield %21 : f16 | |
} -> tensor<1x1x8xf16> | |
// Write the finished 1x1x8 tile back into the 1x64x64 workgroup tile.
%inserted_slice = tensor.insert_slice %18 into %arg6[0, %arg3, %arg5] [1, 1, 8] [1, 1, 1] : tensor<1x1x8xf16> into tensor<1x64x64xf16> | |
scf.yield %inserted_slice : tensor<1x64x64xf16> | |
} | |
scf.yield %10 : tensor<1x64x64xf16> | |
} | |
// Publish this workgroup's 1x64x64 tile into the shared 20x4096x64 output.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<1x64x64xf16> into tensor<20x4096x64xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Store the complete result tensor to binding 3.
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
// -----// IR Dump After DecomposeAttentionPass (iree-linalg-ext-decompose-attention) //----- // | |
// Dispatch entry point for a 20x4096x64 f16 attention computation, in the form
// where the per-tile attention update is spelled out as a sequence of
// linalg.generic ops (scale, contraction, running max, exp2, running sum,
// accumulator rescale, second contraction) inside the innermost scf.for.
// Bindings 0-2 are read-only 20x4096x64xf16 inputs; binding 3 is the
// write-only 20x4096x64xf16 result.
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPULinalgExtTileAndVectorize>} { | |
// 1.0: numerator of the reciprocal taken in the final normalization generic.
%cst = arith.constant 1.000000e+00 : f32 | |
%c4096 = arith.constant 4096 : index | |
// Most negative finite f32; initial value of the running max (%arg9).
%cst_0 = arith.constant -3.40282347E+38 : f32 | |
// 0.0: initial value of the 1x1x8 accumulator (%arg8) and the running sum (%arg10).
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c1 = arith.constant 1 : index | |
// Scale factor folded into the first-input row below (multiplied by %cst_7 first).
// NOTE(review): presumably the softmax scale (0.125 = 1/sqrt(64)) — confirm.
%cst_2 = arith.constant 1.250000e-01 : f16 | |
%c0 = arith.constant 0 : index | |
// Bindings 0..2: read-only inputs; binding 3: write-only output. All share one pipeline layout.
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
// Load the three inputs in full (offsets 0, full sizes, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
// Workgroup-level tiling: %arg0 steps over dim 0 by 1, %arg1 over dim 1 by 64
// (mapped to workgroup y and x respectively, see the mapping attribute below).
%8 = scf.forall (%arg0, %arg1) = (0, 0) to (20, 4096) step (1, 64) shared_outs(%arg2 = %7) -> (tensor<20x4096x64xf16>) { | |
// 1x64x64 output tile owned by this forall iteration.
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x64x64xf16> | |
// %arg3: row within the 64-row tile (step 1); %arg5: output column offset (step 8).
%9 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %extracted_slice) -> (tensor<1x64x64xf16>) { | |
%10 = scf.for %arg5 = %c0 to %c64 step %c8 iter_args(%arg6 = %arg4) -> (tensor<1x64x64xf16>) { | |
// Absolute row index into the first input: tile-local row + tile offset.
%11 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg1] | |
// One full 64-element row of the first input.
%extracted_slice_3 = tensor.extract_slice %4[%arg0, %11, 0] [1, 1, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x1x64xf16> | |
// 1x1x8 destination tile; used only as the outs operand of the final generic.
%extracted_slice_4 = tensor.extract_slice %arg6[0, %arg3, %arg5] [1, 1, 8] [1, 1, 1] : tensor<1x64x64xf16> to tensor<1x1x8xf16> | |
// Initial loop state: %14 = zero accumulator (1x1x8), %15 = running max
// seeded with the most negative finite f32, %16 = running sum seeded with 0.
%12 = tensor.empty() : tensor<1x1x8xf32> | |
%13 = tensor.empty() : tensor<1x1xf32> | |
%14 = linalg.fill ins(%cst_1 : f32) outs(%12 : tensor<1x1x8xf32>) -> tensor<1x1x8xf32> | |
%15 = linalg.fill ins(%cst_0 : f32) outs(%13 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
%16 = linalg.fill ins(%cst_1 : f32) outs(%13 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
// Reduction loop over dim 1 of the second and third inputs, 8 rows per step.
// iter_args: %arg8 = accumulator, %arg9 = running max, %arg10 = running sum.
%17:3 = scf.for %arg7 = %c0 to %c4096 step %c8 iter_args(%arg8 = %14, %arg9 = %15, %arg10 = %16) -> (tensor<1x1x8xf32>, tensor<1x1xf32>, tensor<1x1xf32>) { | |
// 8 rows x 64 columns of the second input; 8 rows x 8 columns (at %arg5) of the third.
%extracted_slice_5 = tensor.extract_slice %5[%arg0, %arg7, 0] [1, 8, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x8x64xf16> | |
%extracted_slice_6 = tensor.extract_slice %6[%arg0, %arg7, %arg5] [1, 8, 8] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x8x8xf16> | |
// NOTE(review): 1.44238 looks like log2(e) rounded to f16, which would let the
// math.exp2 ops below stand in for exp — confirm against the decomposition pass.
%cst_7 = arith.constant 1.442380e+00 : f16 | |
// Combined multiplier %cst_2 * %cst_7. It and %20 below depend only on values
// defined outside this loop, so they are invariant across %arg7 iterations.
%19 = arith.mulf %cst_2, %cst_7 : f16 | |
// Multiply every element of the first-input row by the combined multiplier (in f16).
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%19 : f16) outs(%extracted_slice_3 : tensor<1x1x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
%34 = arith.mulf %in, %out : f16 | |
linalg.yield %34 : f16 | |
} -> tensor<1x1x64xf16> | |
// Zero-initialized f32 destination for the first contraction.
%21 = tensor.empty() : tensor<1x1x8xf32> | |
%cst_8 = arith.constant 0.000000e+00 : f32 | |
%22 = linalg.fill ins(%cst_8 : f32) outs(%21 : tensor<1x1x8xf32>) -> tensor<1x1x8xf32> | |
// First contraction: sum over d2 (the 64-wide dim) of scaled_row[d0,d1,d2] *
// second_input[d0,d3,d2], extended to f32 before multiply-accumulate.
// Result is 1x1x8: one value per row of the 8-row second-input tile.
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%20, %extracted_slice_5 : tensor<1x1x64xf16>, tensor<1x8x64xf16>) outs(%22 : tensor<1x1x8xf32>) { | |
^bb0(%in: f16, %in_9: f16, %out: f32): | |
%34 = arith.extf %in : f16 to f32 | |
%35 = arith.extf %in_9 : f16 to f32 | |
%36 = arith.mulf %34, %35 : f32 | |
%37 = arith.addf %36, %out : f32 | |
linalg.yield %37 : f32 | |
} -> tensor<1x1x8xf32> | |
// Identity generic: yields each element of %23 unchanged.
// NOTE(review): presumably the inlined pass-through region of the original
// attention op — confirm.
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} outs(%23 : tensor<1x1x8xf32>) { | |
^bb0(%out: f32): | |
linalg.yield %out : f32 | |
} -> tensor<1x1x8xf32> | |
// New running max: maximumf-reduce %24 over its last dim into the previous max %arg9.
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%24 : tensor<1x1x8xf32>) outs(%arg9 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%34 = arith.maximumf %in, %out : f32 | |
linalg.yield %34 : f32 | |
} -> tensor<1x1xf32> | |
// Correction factor: exp2(old_max - new_max), where %out is %arg9 (old) and %in is %25 (new).
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%25 : tensor<1x1xf32>) outs(%arg9 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%34 = arith.subf %out, %in : f32 | |
%35 = math.exp2 %34 : f32 | |
linalg.yield %35 : f32 | |
} -> tensor<1x1xf32> | |
// Rescale the previous running sum %arg10 by the correction factor.
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%26 : tensor<1x1xf32>) outs(%arg10 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%34 = arith.mulf %in, %out : f32 | |
linalg.yield %34 : f32 | |
} -> tensor<1x1xf32> | |
// Per-element exp2(value - new_max), with the new max broadcast over the last dim.
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%25 : tensor<1x1xf32>) outs(%24 : tensor<1x1x8xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%34 = arith.subf %out, %in : f32 | |
%35 = math.exp2 %34 : f32 | |
linalg.yield %35 : f32 | |
} -> tensor<1x1x8xf32> | |
// New running sum: add-reduce %28 over its last dim onto the rescaled sum %27.
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%28 : tensor<1x1x8xf32>) outs(%27 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%34 = arith.addf %in, %out : f32 | |
linalg.yield %34 : f32 | |
} -> tensor<1x1xf32> | |
// Truncate the exp2 values to f16 for use as the left operand of the second contraction.
%30 = tensor.empty() : tensor<1x1x8xf16> | |
%31 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%28 : tensor<1x1x8xf32>) outs(%30 : tensor<1x1x8xf16>) { | |
^bb0(%in: f32, %out: f16): | |
%34 = arith.truncf %in : f32 to f16 | |
linalg.yield %34 : f16 | |
} -> tensor<1x1x8xf16> | |
// Rescale the previous accumulator %arg8 by the correction factor (broadcast over last dim).
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%26 : tensor<1x1xf32>) outs(%arg8 : tensor<1x1x8xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%34 = arith.mulf %in, %out : f32 | |
linalg.yield %34 : f32 | |
} -> tensor<1x1x8xf32> | |
// Second contraction: sum over d2 (the 8-row dim) of %31[d0,d1,d2] *
// third_input[d0,d2,d3], accumulated in f32 on top of the rescaled accumulator %32.
%33 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>], iterator_types = ["parallel", "parallel", "reduction", "parallel"]} ins(%31, %extracted_slice_6 : tensor<1x1x8xf16>, tensor<1x8x8xf16>) outs(%32 : tensor<1x1x8xf32>) { | |
^bb0(%in: f16, %in_9: f16, %out: f32): | |
%34 = arith.extf %in : f16 to f32 | |
%35 = arith.extf %in_9 : f16 to f32 | |
%36 = arith.mulf %34, %35 : f32 | |
%37 = arith.addf %36, %out : f32 | |
linalg.yield %37 : f32 | |
} -> tensor<1x1x8xf32> | |
// Carry forward: new accumulator, new running max, new running sum.
scf.yield %33, %25, %29 : tensor<1x1x8xf32>, tensor<1x1xf32>, tensor<1x1xf32> | |
} | |
// Final normalization: out = truncf((1.0 / running_sum) * accumulator), with the
// 1x1 running sum %17#2 broadcast across the 8-wide last dimension.
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17#2, %17#0 : tensor<1x1xf32>, tensor<1x1x8xf32>) outs(%extracted_slice_4 : tensor<1x1x8xf16>) { | |
^bb0(%in: f32, %in_5: f32, %out: f16): | |
%19 = arith.divf %cst, %in : f32 | |
%20 = arith.mulf %19, %in_5 : f32 | |
%21 = arith.truncf %20 : f32 to f16 | |
linalg.yield %21 : f16 | |
} -> tensor<1x1x8xf16> | |
// Write the finished 1x1x8 tile back into the 1x64x64 workgroup tile.
%inserted_slice = tensor.insert_slice %18 into %arg6[0, %arg3, %arg5] [1, 1, 8] [1, 1, 1] : tensor<1x1x8xf16> into tensor<1x64x64xf16> | |
scf.yield %inserted_slice : tensor<1x64x64xf16> | |
} | |
scf.yield %10 : tensor<1x64x64xf16> | |
} | |
// Publish this workgroup's 1x64x64 tile into the shared 20x4096x64 output.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<1x64x64xf16> into tensor<20x4096x64xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Store the complete result tensor to binding 3.
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
return | |
} | |
// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- // | |
func.func @attention_dispatch_0_attention_20x4096x64xf16_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPULinalgExtTileAndVectorize>} { | |
%cst = arith.constant dense<1.000000e+00> : vector<8x1x1xf32> | |
%cst_0 = arith.constant dense<1.802980e-01> : vector<1x1x64xf16> | |
%cst_1 = arith.constant 0.000000e+00 : f16 | |
%cst_2 = arith.constant dense<0.000000e+00> : vector<1x1xf32> | |
%cst_3 = arith.constant dense<-3.40282347E+38> : vector<1x1xf32> | |
%cst_4 = arith.constant dense<0.000000e+00> : vector<1x1x8xf32> | |
%c4096 = arith.constant 4096 : index | |
%cst_5 = arith.constant 0.000000e+00 : f32 | |
%c8 = arith.constant 8 : index | |
%c64 = arith.constant 64 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>> | |
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16> | |
%7 = tensor.empty() : tensor<20x4096x64xf16> | |
%8 = scf.forall (%arg0, %arg1) = (0, 0) to (20, 4096) step (1, 64) shared_outs(%arg2 = %7) -> (tensor<20x4096x64xf16>) { | |
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0] [1, 64, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x64x64xf16> | |
%9 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %extracted_slice) -> (tensor<1x64x64xf16>) { | |
%10 = scf.for %arg5 = %c0 to %c64 step %c8 iter_args(%arg6 = %arg4) -> (tensor<1x64x64xf16>) { | |
%11 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg1] | |
%extracted_slice_6 = tensor.extract_slice %4[%arg0, %11, 0] [1, 1, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x1x64xf16> | |
%extracted_slice_7 = tensor.extract_slice %arg6[0, %arg3, %arg5] [1, 1, 8] [1, 1, 1] : tensor<1x64x64xf16> to tensor<1x1x8xf16> | |
%12 = tensor.empty() : tensor<1x1x8xf32> | |
%13 = tensor.empty() : tensor<1x1xf32> | |
%14 = vector.transfer_write %cst_4, %12[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x8xf32>, tensor<1x1x8xf32> | |
%15 = vector.transfer_write %cst_3, %13[%c0, %c0] {in_bounds = [true, true]} : vector<1x1xf32>, tensor<1x1xf32> | |
%16 = vector.transfer_write %cst_2, %13[%c0, %c0] {in_bounds = [true, true]} : vector<1x1xf32>, tensor<1x1xf32> | |
%17:3 = scf.for %arg7 = %c0 to %c4096 step %c8 iter_args(%arg8 = %14, %arg9 = %15, %arg10 = %16) -> (tensor<1x1x8xf32>, tensor<1x1xf32>, tensor<1x1xf32>) { | |
%extracted_slice_8 = tensor.extract_slice %5[%arg0, %arg7, 0] [1, 8, 64] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x8x64xf16> | |
%extracted_slice_9 = tensor.extract_slice %6[%arg0, %arg7, %arg5] [1, 8, 8] [1, 1, 1] : tensor<20x4096x64xf16> to tensor<1x8x8xf16> | |
%26 = vector.transfer_read %extracted_slice_6[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true]} : tensor<1x1x64xf16>, vector<1x1x64xf16> | |
%27 = arith.mulf %26, %cst_0 : vector<1x1x64xf16> | |
%28 = vector.transfer_read %extracted_slice_8[%c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true]} : tensor<1x8x64xf16>, vector<1x8x64xf16> | |
%29 = arith.extf %27 : vector<1x1x64xf16> to vector<1x1x64xf32> | |
%30 = arith.extf %28 : vector<1x8x64xf16> to vector<1x8x64xf32> | |
%31 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>], iterator_types = ["parallel", "parallel", "reduction", "parallel"], kind = #vector.kind<add>} %29, %30, %cst_4 : vector<1x1x64xf32>, vector<1x8x64xf32> into vector<1x1x8xf32> | |
%32 = vector.transfer_read %arg9[%c0, %c0], %cst_5 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32> | |
%33 = vector.multi_reduction <maximumf>, %31, %32 [2] : vector<1x1x8xf32> to vector<1x1xf32> | |
%34 = vector.transfer_write %33, %arg9[%c0, %c0] {in_bounds = [true, true]} : vector<1x1xf32>, tensor<1x1xf32> | |
%35 = vector.transfer_read %arg9[%c0, %c0], %cst_5 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32> | |
%36 = arith.subf %35, %33 : vector<1x1xf32> | |
%37 = math.exp2 %36 : vector<1x1xf32> | |
%38 = vector.transfer_read %arg10[%c0, %c0], %cst_5 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32> | |
%39 = arith.mulf %37, %38 : vector<1x1xf32> | |
%40 = vector.broadcast %33 : vector<1x1xf32> to vector<8x1x1xf32> | |
%41 = vector.transpose %40, [1, 2, 0] : vector<8x1x1xf32> to vector<1x1x8xf32> | |
%42 = arith.subf %31, %41 : vector<1x1x8xf32> | |
%43 = math.exp2 %42 : vector<1x1x8xf32> | |
%44 = vector.multi_reduction <add>, %43, %39 [2] : vector<1x1x8xf32> to vector<1x1xf32> | |
%45 = vector.transfer_write %44, %arg10[%c0, %c0] {in_bounds = [true, true]} : vector<1x1xf32>, tensor<1x1xf32> | |
%46 = arith.truncf %43 : vector<1x1x8xf32> to vector<1x1x8xf16> | |
%47 = vector.broadcast %37 : vector<1x1xf32> to vector<8x1x1xf32> | |
%48 = vector.transpose %47, [1, 2, 0] : vector<8x1x1xf32> to vector<1x1x8xf32> | |
%49 = vector.transfer_read %arg8[%c0, %c0, %c0], %cst_5 {in_bounds = [true, true, true]} : tensor<1x1x8xf32>, vector<1x1x8xf32> | |
%50 = arith.mulf %48, %49 : vector<1x1x8xf32> | |
%51 = vector.transfer_read %extracted_slice_9[%c0, %c0, %c0], %cst_1 {in_bounds = [true, |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment