// NOTE: this file has been truncated; the IR dump continues beyond this excerpt.
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>
#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> ()>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map6 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>
#pipeline_layout = #hal.pipeline.layout<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {hal.device.targets = [#device_target_hip]} {
hal.executable public @run_forward$async_dispatch_46 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
}
}
}
}
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>
#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> ()>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map6 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>
#pipeline_layout = #hal.pipeline.layout<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_46 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
}
}
}
}
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>
#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> ()>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map6 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>
#pipeline_layout = #hal.pipeline.layout<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_46 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
}
}
}
}
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>
#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> ()>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map6 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>
#pipeline_layout = #hal.pipeline.layout<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_46 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
}
}
}
}
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>
#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> ()>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map6 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>
#pipeline_layout = #hal.pipeline.layout<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_46 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
}
}
}
}
// -----// IR Dump After GPUGeneralizeNamedOpsPass (iree-codegen-gpu-generalize-named-ops) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After TypePropagationPass (iree-codegen-type-propagation) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After MaterializeEncodingIntoNopPass (iree-codegen-materialize-encoding-into-nop) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After BlockDynamicDimensionsPass (iree-codegen-block-dynamic-dimensions) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After MaterializeTuningSpecsPass (iree-codegen-materialize-tuning-specs) //----- //
module {
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
}
// -----// IR Dump After MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- //
module {
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
}
// -----// IR Dump After LLVMGPUSelectLoweringStrategyPass (iree-llvmgpu-select-lowering-strategy) //----- //
module {
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
}
// -----// IR Dump After ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- //
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>) {
hal.executable.export public @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic ordinal(0) layout(#hal.pipeline.layout<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
}
}
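// Annotation (not compiler output): the configuration pass attached the LLVMGPUVectorDistribute
// pipeline with workgroup_size = [128, 1, 1] and subgroup_size = 64, i.e. two subgroups per
// workgroup, consistent with subgroup_m_count = 2 in the QK/PV decomposition configs above.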
// -----// IR Dump After ConfigureExecutablesPass (iree-hal-configure-executables) //----- //
hal.executable public @run_forward$async_dispatch_46 {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>) {
hal.executable.export public @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic ordinal(0) layout(#hal.pipeline.layout<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
}
}
}
// -----// IR Dump After HoistExecutableObjectsPass (iree-hal-hoist-executable-objects) //----- //
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>) {
hal.executable.export public @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic ordinal(0) layout(#hal.pipeline.layout<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
}
}
// -----// IR Dump After LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- //
module {
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = iree_linalg_ext.attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) {
^bb0(%arg0: f32):
iree_linalg_ext.yield %arg0 : f32
} -> tensor<2x10x4096x64xf16>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x4096x10x64xf16>
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
}
// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%extracted_slice_0 = tensor.extract_slice %12[%arg0, %arg1, 0, 0] [1, 1, 4096, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x4096x64xf16>
%extracted_slice_1 = tensor.extract_slice %10[%arg0, %arg1, 0, 0] [1, 1, 64, 4096] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x4096xf16>
%15 = tensor.empty() : tensor<1x1x64x64xf16>
%16 = iree_linalg_ext.attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1, %cst : tensor<1x1x64x64xf16>, tensor<1x1x4096x64xf16>, tensor<1x1x64x4096xf16>, f16) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%arg4: f32):
iree_linalg_ext.yield %arg4 : f32
} -> tensor<1x1x64x64xf16>
%extracted_slice_2 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%16 : tensor<1x1x64x64xf16>) outs(%extracted_slice_2 : tensor<1x64x1x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
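// Annotation (not compiler output): tiling distributed the dispatch into an scf.forall over
// (batch, head, M) = (2, 10, 4096) with steps (1, 1, 64), i.e. 2 * 10 * (4096/64) = 1280
// workgroups mapped to ids z/y/x. Each workgroup computes one 1x1x64x64 output tile from a
// 1x1x64x64 Q slice and the full 1x1x4096x64 K and 1x1x64x4096 V slices of its (batch, head);
// the permutation to 2x4096x10x64 is fused into the per-tile epilogue and written back through
// tensor.parallel_insert_slice.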
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%extracted_slice_0 = tensor.extract_slice %12[%arg0, %arg1, 0, 0] [1, 1, 4096, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x4096x64xf16>
%extracted_slice_1 = tensor.extract_slice %10[%arg0, %arg1, 0, 0] [1, 1, 64, 4096] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x4096xf16>
%15 = tensor.empty() : tensor<1x1x64x64xf16>
%16 = iree_linalg_ext.attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1, %cst : tensor<1x1x64x64xf16>, tensor<1x1x4096x64xf16>, tensor<1x1x64x4096xf16>, f16) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%arg4: f32):
iree_linalg_ext.yield %arg4 : f32
} -> tensor<1x1x64x64xf16>
%extracted_slice_2 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%16 : tensor<1x1x64x64xf16>) outs(%extracted_slice_2 : tensor<1x64x1x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%extracted_slice_0 = tensor.extract_slice %12[%arg0, %arg1, 0, 0] [1, 1, 4096, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x4096x64xf16>
%extracted_slice_1 = tensor.extract_slice %10[%arg0, %arg1, 0, 0] [1, 1, 64, 4096] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x4096xf16>
%15 = tensor.empty() : tensor<1x1x64x64xf16>
%16 = iree_linalg_ext.attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1, %cst : tensor<1x1x64x64xf16>, tensor<1x1x4096x64xf16>, tensor<1x1x64x4096xf16>, f16) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%arg4: f32):
iree_linalg_ext.yield %arg4 : f32
} -> tensor<1x1x64x64xf16>
%extracted_slice_2 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%16 : tensor<1x1x64x64xf16>) outs(%extracted_slice_2 : tensor<1x64x1x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After ReorderWorkgroupsPass (iree-codegen-reorder-workgroups) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%extracted_slice_0 = tensor.extract_slice %12[%arg0, %arg1, 0, 0] [1, 1, 4096, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x4096x64xf16>
%extracted_slice_1 = tensor.extract_slice %10[%arg0, %arg1, 0, 0] [1, 1, 64, 4096] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x4096xf16>
%15 = tensor.empty() : tensor<1x1x64x64xf16>
%16 = iree_linalg_ext.attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1, %cst : tensor<1x1x64x64xf16>, tensor<1x1x4096x64xf16>, tensor<1x1x64x4096xf16>, f16) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%arg4: f32):
iree_linalg_ext.yield %arg4 : f32
} -> tensor<1x1x64x64xf16>
%extracted_slice_2 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%16 : tensor<1x1x64x64xf16>) outs(%extracted_slice_2 : tensor<1x64x1x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After ConvertAttentionToOnlineAttentionPass (iree-linalg-ext-convert-attention-to-online-attention) //----- //
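// NOTE: the iree_linalg_ext.attention op above is rewritten below into
// iree_linalg_ext.online_attention, which carries three results instead of one:
// the unnormalized accumulator (filled with 0.0), the running row maximum
// (filled with -3.40282347E+38, the most negative finite f32), and the running
// softmax sum (filled with 0.0). A trailing linalg.generic then rescales the
// accumulator by 1/sum and truncates the result back to f16. Per K/V tile this
// follows the usual online-softmax (flash-attention) recurrence — a sketch,
// not the literal decomposition produced by later passes:
//   S     = scale * (Q . K^T)
//   m_new = max(m_old, rowmax(S))
//   P     = exp(S - m_new)
//   c     = exp(m_old - m_new)
//   sum   = sum * c + rowsum(P)
//   acc   = acc * c + P . V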
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%extracted_slice_0 = tensor.extract_slice %12[%arg0, %arg1, 0, 0] [1, 1, 4096, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x4096x64xf16>
%extracted_slice_1 = tensor.extract_slice %10[%arg0, %arg1, 0, 0] [1, 1, 64, 4096] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x4096xf16>
%15 = tensor.empty() : tensor<1x1x64x64xf16>
%16 = tensor.empty() : tensor<1x1x64x64xf32>
%17 = tensor.empty() : tensor<1x1x64xf32>
%cst_2 = arith.constant 0.000000e+00 : f32
%cst_3 = arith.constant -3.40282347E+38 : f32
%cst_4 = arith.constant 0.000000e+00 : f32
%18 = linalg.fill ins(%cst_2 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32>
%19 = linalg.fill ins(%cst_3 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%20 = linalg.fill ins(%cst_4 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%21:3 = iree_linalg_ext.online_attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1, %cst : tensor<1x1x64x64xf16>, tensor<1x1x4096x64xf16>, tensor<1x1x64x4096xf16>, f16) outs(%18, %19, %20 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) {
^bb0(%arg4: f32):
iree_linalg_ext.yield %arg4 : f32
} -> tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21#2, %21#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f32, %in_6: f32, %out: f16):
%cst_7 = arith.constant 1.000000e+00 : f32
%24 = arith.divf %cst_7, %in : f32
%25 = arith.mulf %24, %in_6 : f32
%26 = arith.truncf %25 : f32 to f16
linalg.yield %26 : f16
} -> tensor<1x1x64x64xf16>
%extracted_slice_5 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22 : tensor<1x1x64x64xf16>) outs(%extracted_slice_5 : tensor<1x64x1x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %23 into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
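// NOTE: canonicalization hoists the scalar constants (1.0, -3.40282347E+38,
// 0.0, 0.125) to the top of the function and reuses a single 0.0 constant for
// both zero-initializing fills; the attention computation itself is unchanged.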
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.000000e+00 : f32
%cst_0 = arith.constant -3.40282347E+38 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%extracted_slice_3 = tensor.extract_slice %12[%arg0, %arg1, 0, 0] [1, 1, 4096, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x4096x64xf16>
%extracted_slice_4 = tensor.extract_slice %10[%arg0, %arg1, 0, 0] [1, 1, 64, 4096] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x4096xf16>
%15 = tensor.empty() : tensor<1x1x64x64xf16>
%16 = tensor.empty() : tensor<1x1x64x64xf32>
%17 = tensor.empty() : tensor<1x1x64xf32>
%18 = linalg.fill ins(%cst_1 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32>
%19 = linalg.fill ins(%cst_0 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%20 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%21:3 = iree_linalg_ext.online_attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%extracted_slice, %extracted_slice_3, %extracted_slice_4, %cst_2 : tensor<1x1x64x64xf16>, tensor<1x1x4096x64xf16>, tensor<1x1x64x4096xf16>, f16) outs(%18, %19, %20 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) {
^bb0(%arg4: f32):
iree_linalg_ext.yield %arg4 : f32
} -> tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21#2, %21#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f32, %in_6: f32, %out: f16):
%24 = arith.divf %cst, %in : f32
%25 = arith.mulf %24, %in_6 : f32
%26 = arith.truncf %25 : f32 to f16
linalg.yield %26 : f16
} -> tensor<1x1x64x64xf16>
%extracted_slice_5 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22 : tensor<1x1x64x64xf16>) outs(%extracted_slice_5 : tensor<1x64x1x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %23 into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After CSE (cse) //----- //
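// NOTE: CSE finds nothing further to eliminate in this function; the dump
// below is effectively identical to the canonicalized IR above.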
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.000000e+00 : f32
%cst_0 = arith.constant -3.40282347E+38 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%extracted_slice_3 = tensor.extract_slice %12[%arg0, %arg1, 0, 0] [1, 1, 4096, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x4096x64xf16>
%extracted_slice_4 = tensor.extract_slice %10[%arg0, %arg1, 0, 0] [1, 1, 64, 4096] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x4096xf16>
%15 = tensor.empty() : tensor<1x1x64x64xf16>
%16 = tensor.empty() : tensor<1x1x64x64xf32>
%17 = tensor.empty() : tensor<1x1x64xf32>
%18 = linalg.fill ins(%cst_1 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32>
%19 = linalg.fill ins(%cst_0 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%20 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%21:3 = iree_linalg_ext.online_attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%extracted_slice, %extracted_slice_3, %extracted_slice_4, %cst_2 : tensor<1x1x64x64xf16>, tensor<1x1x4096x64xf16>, tensor<1x1x64x4096xf16>, f16) outs(%18, %19, %20 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) {
^bb0(%arg4: f32):
iree_linalg_ext.yield %arg4 : f32
} -> tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21#2, %21#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f32, %in_6: f32, %out: f16):
%24 = arith.divf %cst, %in : f32
%25 = arith.mulf %24, %in_6 : f32
%26 = arith.truncf %25 : f32 to f16
linalg.yield %26 : f16
} -> tensor<1x1x64x64xf16>
%extracted_slice_5 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22 : tensor<1x1x64x64xf16>) outs(%extracted_slice_5 : tensor<1x64x1x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %23 into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After GPUPromoteMatmulOperandsPass (iree-codegen-gpu-promote-matmul-operands) //----- //
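// NOTE: following promote_operands = [0, 1, 2] in the lowering config, the Q,
// K, and V slices are now routed through linalg.copy ops tagged with
// #iree_gpu.derived_thread_config (%22, %24, %26); these copies are what later
// bufferization is expected to stage through workgroup (shared) memory.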
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.000000e+00 : f32
%cst_0 = arith.constant -3.40282347E+38 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%extracted_slice_3 = tensor.extract_slice %12[%arg0, %arg1, 0, 0] [1, 1, 4096, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x4096x64xf16>
%extracted_slice_4 = tensor.extract_slice %10[%arg0, %arg1, 0, 0] [1, 1, 64, 4096] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x4096xf16>
%15 = tensor.empty() : tensor<1x1x64x64xf16>
%16 = tensor.empty() : tensor<1x1x64x64xf32>
%17 = tensor.empty() : tensor<1x1x64xf32>
%18 = linalg.fill ins(%cst_1 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32>
%19 = linalg.fill ins(%cst_0 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%20 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%21 = tensor.empty() : tensor<1x1x64x64xf16>
%22 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x64x64xf16>) outs(%21 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%23 = tensor.empty() : tensor<1x1x4096x64xf16>
%24 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x4096x64xf16>) outs(%23 : tensor<1x1x4096x64xf16>) -> tensor<1x1x4096x64xf16>
%25 = tensor.empty() : tensor<1x1x64x4096xf16>
%26 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_4 : tensor<1x1x64x4096xf16>) outs(%25 : tensor<1x1x64x4096xf16>) -> tensor<1x1x64x4096xf16>
%27:3 = iree_linalg_ext.online_attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%22, %24, %26, %cst_2 : tensor<1x1x64x64xf16>, tensor<1x1x4096x64xf16>, tensor<1x1x64x4096xf16>, f16) outs(%18, %19, %20 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) {
^bb0(%arg4: f32):
iree_linalg_ext.yield %arg4 : f32
} -> tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%27#2, %27#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f32, %in_6: f32, %out: f16):
%30 = arith.divf %cst, %in : f32
%31 = arith.mulf %30, %in_6 : f32
%32 = arith.truncf %31 : f32 to f16
linalg.yield %32 : f16
} -> tensor<1x1x64x64xf16>
%extracted_slice_5 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%28 : tensor<1x1x64x64xf16>) outs(%extracted_slice_5 : tensor<1x64x1x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %29 into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After GPUApplyTilingLevelPass (iree-codegen-gpu-apply-tiling-level) //----- //
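// NOTE: the reduction tile size of 64 from the lowering config turns the
// single online_attention over the 4096-long K/V sequence into an scf.for
// loop (0 to 4096 step 64) that carries the accumulator, running max, and
// running sum as iter_args; the K and V promotion copies now move one
// 1x1x64x64 tile per iteration, while the Q copy stays hoisted above the loop.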
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 1.000000e+00 : f32
%cst_0 = arith.constant -3.40282347E+38 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%15 = tensor.empty() : tensor<1x1x64x64xf16>
%16 = tensor.empty() : tensor<1x1x64x64xf32>
%17 = tensor.empty() : tensor<1x1x64xf32>
%18 = linalg.fill ins(%cst_1 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32>
%19 = linalg.fill ins(%cst_0 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%20 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%21 = tensor.empty() : tensor<1x1x64x64xf16>
%22 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x64x64xf16>) outs(%21 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%23:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %18, %arg6 = %19, %arg7 = %20) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) {
%extracted_slice_4 = tensor.extract_slice %12[%arg0, %arg1, %arg4, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%26 = tensor.empty() : tensor<1x1x64x64xf16>
%27 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_4 : tensor<1x1x64x64xf16>) outs(%26 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%extracted_slice_5 = tensor.extract_slice %10[%arg0, %arg1, 0, %arg4] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x64xf16>
%28 = tensor.empty() : tensor<1x1x64x64xf16>
%29 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_5 : tensor<1x1x64x64xf16>) outs(%28 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%30:3 = iree_linalg_ext.online_attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%22, %27, %29, %cst_2 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>, f16) outs(%arg5, %arg6, %arg7 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) {
^bb0(%arg8: f32):
iree_linalg_ext.yield %arg8 : f32
} -> tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>
scf.yield %30#0, %30#1, %30#2 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>
}
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23#2, %23#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f32, %in_4: f32, %out: f16):
%26 = arith.divf %cst, %in : f32
%27 = arith.mulf %26, %in_4 : f32
%28 = arith.truncf %27 : f32 to f16
linalg.yield %28 : f16
} -> tensor<1x1x64x64xf16>
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%24 : tensor<1x1x64x64xf16>) outs(%extracted_slice_3 : tensor<1x64x1x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %25 into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After LoopCoalescing (affine-loop-coalescing) //----- //
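// NOTE: loop coalescing makes no change here; the function contains only the
// single scf.for reduction loop, so there is no loop nest to merge.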
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 1.000000e+00 : f32
%cst_0 = arith.constant -3.40282347E+38 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%15 = tensor.empty() : tensor<1x1x64x64xf16>
%16 = tensor.empty() : tensor<1x1x64x64xf32>
%17 = tensor.empty() : tensor<1x1x64xf32>
%18 = linalg.fill ins(%cst_1 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32>
%19 = linalg.fill ins(%cst_0 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%20 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%21 = tensor.empty() : tensor<1x1x64x64xf16>
%22 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x64x64xf16>) outs(%21 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%23:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %18, %arg6 = %19, %arg7 = %20) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) {
%extracted_slice_4 = tensor.extract_slice %12[%arg0, %arg1, %arg4, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%26 = tensor.empty() : tensor<1x1x64x64xf16>
%27 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_4 : tensor<1x1x64x64xf16>) outs(%26 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%extracted_slice_5 = tensor.extract_slice %10[%arg0, %arg1, 0, %arg4] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x64xf16>
%28 = tensor.empty() : tensor<1x1x64x64xf16>
%29 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_5 : tensor<1x1x64x64xf16>) outs(%28 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%30:3 = iree_linalg_ext.online_attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%22, %27, %29, %cst_2 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>, f16) outs(%arg5, %arg6, %arg7 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) {
^bb0(%arg8: f32):
iree_linalg_ext.yield %arg8 : f32
} -> tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>
scf.yield %30#0, %30#1, %30#2 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>
}
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23#2, %23#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f32, %in_4: f32, %out: f16):
%26 = arith.divf %cst, %in : f32
%27 = arith.mulf %26, %in_4 : f32
%28 = arith.truncf %27 : f32 to f16
linalg.yield %28 : f16
} -> tensor<1x1x64x64xf16>
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%24 : tensor<1x1x64x64xf16>) outs(%extracted_slice_3 : tensor<1x64x1x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %25 into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 1.000000e+00 : f32
%cst_0 = arith.constant -3.40282347E+38 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%15 = tensor.empty() : tensor<1x1x64x64xf16>
%16 = tensor.empty() : tensor<1x1x64x64xf32>
%17 = tensor.empty() : tensor<1x1x64xf32>
%18 = linalg.fill ins(%cst_1 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32>
%19 = linalg.fill ins(%cst_0 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%20 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%21 = tensor.empty() : tensor<1x1x64x64xf16>
%22 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x64x64xf16>) outs(%21 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%23:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %18, %arg6 = %19, %arg7 = %20) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) {
%extracted_slice_4 = tensor.extract_slice %12[%arg0, %arg1, %arg4, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%26 = tensor.empty() : tensor<1x1x64x64xf16>
%27 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_4 : tensor<1x1x64x64xf16>) outs(%26 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%extracted_slice_5 = tensor.extract_slice %10[%arg0, %arg1, 0, %arg4] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x64xf16>
%28 = tensor.empty() : tensor<1x1x64x64xf16>
%29 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_5 : tensor<1x1x64x64xf16>) outs(%28 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%30:3 = iree_linalg_ext.online_attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%22, %27, %29, %cst_2 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>, f16) outs(%arg5, %arg6, %arg7 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) {
^bb0(%arg8: f32):
iree_linalg_ext.yield %arg8 : f32
} -> tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>
scf.yield %30#0, %30#1, %30#2 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>
}
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23#2, %23#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f32, %in_4: f32, %out: f16):
%26 = arith.divf %cst, %in : f32
%27 = arith.mulf %26, %in_4 : f32
%28 = arith.truncf %27 : f32 to f16
linalg.yield %28 : f16
} -> tensor<1x1x64x64xf16>
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%24 : tensor<1x1x64x64xf16>) outs(%extracted_slice_3 : tensor<1x64x1x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %25 into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After CSE (cse) //----- //
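// note (annotation): compared with the dump above, CSE has deduplicated the identical tensor.empty() : tensor<1x1x64x64xf16> ops, so the Q/K/V staging copies and the final f16 result all reuse the same init value (%15) and the loop body no longer materializes its own empty tensors.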
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 1.000000e+00 : f32
%cst_0 = arith.constant -3.40282347E+38 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%15 = tensor.empty() : tensor<1x1x64x64xf16>
%16 = tensor.empty() : tensor<1x1x64x64xf32>
%17 = tensor.empty() : tensor<1x1x64xf32>
%18 = linalg.fill ins(%cst_1 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32>
%19 = linalg.fill ins(%cst_0 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%20 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%22:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %18, %arg6 = %19, %arg7 = %20) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) {
%extracted_slice_4 = tensor.extract_slice %12[%arg0, %arg1, %arg4, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%25 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_4 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%extracted_slice_5 = tensor.extract_slice %10[%arg0, %arg1, 0, %arg4] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x64xf16>
%26 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_5 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%27:3 = iree_linalg_ext.online_attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%21, %25, %26, %cst_2 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>, f16) outs(%arg5, %arg6, %arg7 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) {
^bb0(%arg8: f32):
iree_linalg_ext.yield %arg8 : f32
} -> tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>
scf.yield %27#0, %27#1, %27#2 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>
}
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22#2, %22#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f32, %in_4: f32, %out: f16):
%25 = arith.divf %cst, %in : f32
%26 = arith.mulf %25, %in_4 : f32
%27 = arith.truncf %26 : f32 to f16
linalg.yield %27 : f16
} -> tensor<1x1x64x64xf16>
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23 : tensor<1x1x64x64xf16>) outs(%extracted_slice_3 : tensor<1x64x1x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %24 into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After DecomposeAttentionPass (iree-linalg-ext-decompose-attention) //----- //
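// note (annotation): the iree_linalg_ext.online_attention op is now expanded into its flash-attention building blocks inside the K/V loop:
//   S       = (Q * scale * log2(e)) @ K^T        (attention_qk_matmul, f32 accumulate)
//   m_new   = max(m_old, rowmax(S))
//   corr    = exp2(m_old - m_new)
//   P       = exp2(S - m_new)
//   sum_new = corr * sum_old + rowsum(P)
//   acc_new = corr * acc_old + P @ V             (attention_pv_matmul, f32 accumulate)
// The extra factor 1.442380e+00 (log2(e) rounded to f16) lets the softmax use exp2 instead of exp; the final 1/sum normalization stays outside the loop.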
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 1.000000e+00 : f32
%cst_0 = arith.constant -3.40282347E+38 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant 1.250000e-01 : f16
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%15 = tensor.empty() : tensor<1x1x64x64xf16>
%16 = tensor.empty() : tensor<1x1x64x64xf32>
%17 = tensor.empty() : tensor<1x1x64xf32>
%18 = linalg.fill ins(%cst_1 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32>
%19 = linalg.fill ins(%cst_0 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%20 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%22:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %18, %arg6 = %19, %arg7 = %20) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) {
%extracted_slice_4 = tensor.extract_slice %12[%arg0, %arg1, %arg4, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%25 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_4 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%extracted_slice_5 = tensor.extract_slice %10[%arg0, %arg1, 0, %arg4] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x64xf16>
%26 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_5 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%cst_6 = arith.constant 1.442380e+00 : f16
%27 = arith.mulf %cst_2, %cst_6 : f16
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%27 : f16) outs(%21 : tensor<1x1x64x64xf16>) {
^bb0(%in: f16, %out: f16):
%42 = arith.mulf %in, %out : f16
linalg.yield %42 : f16
} -> tensor<1x1x64x64xf16>
%29 = tensor.empty() : tensor<1x1x64x64xf32>
%cst_7 = arith.constant 0.000000e+00 : f32
%30 = linalg.fill ins(%cst_7 : f32) outs(%29 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32>
%31 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>], iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel"]} ins(%28, %25 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%30 : tensor<1x1x64x64xf32>) attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} {
^bb0(%in: f16, %in_8: f16, %out: f32):
%42 = arith.extf %in : f16 to f32
%43 = arith.extf %in_8 : f16 to f32
%44 = arith.mulf %42, %43 : f32
%45 = arith.addf %44, %out : f32
linalg.yield %45 : f32
} -> tensor<1x1x64x64xf32>
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%31 : tensor<1x1x64x64xf32>) {
^bb0(%out: f32):
linalg.yield %out : f32
} -> tensor<1x1x64x64xf32>
%33 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%32 : tensor<1x1x64x64xf32>) outs(%arg6 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%42 = arith.maximumf %in, %out : f32
linalg.yield %42 : f32
} -> tensor<1x1x64xf32>
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%33 : tensor<1x1x64xf32>) outs(%arg6 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%42 = arith.subf %out, %in : f32
%43 = math.exp2 %42 : f32
linalg.yield %43 : f32
} -> tensor<1x1x64xf32>
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%34 : tensor<1x1x64xf32>) outs(%arg7 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%42 = arith.mulf %in, %out : f32
linalg.yield %42 : f32
} -> tensor<1x1x64xf32>
%36 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%33 : tensor<1x1x64xf32>) outs(%32 : tensor<1x1x64x64xf32>) {
^bb0(%in: f32, %out: f32):
%42 = arith.subf %out, %in : f32
%43 = math.exp2 %42 : f32
linalg.yield %43 : f32
} -> tensor<1x1x64x64xf32>
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%36 : tensor<1x1x64x64xf32>) outs(%35 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%42 = arith.addf %in, %out : f32
linalg.yield %42 : f32
} -> tensor<1x1x64xf32>
%38 = tensor.empty() : tensor<1x1x64x64xf16>
%39 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%36 : tensor<1x1x64x64xf32>) outs(%38 : tensor<1x1x64x64xf16>) {
^bb0(%in: f32, %out: f16):
%42 = arith.truncf %in : f32 to f16
linalg.yield %42 : f16
} -> tensor<1x1x64x64xf16>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%34 : tensor<1x1x64xf32>) outs(%arg5 : tensor<1x1x64x64xf32>) {
^bb0(%in: f32, %out: f32):
%42 = arith.mulf %in, %out : f32
linalg.yield %42 : f32
} -> tensor<1x1x64x64xf32>
%41 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%39, %26 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%40 : tensor<1x1x64x64xf32>) attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} {
^bb0(%in: f16, %in_8: f16, %out: f32):
%42 = arith.extf %in : f16 to f32
%43 = arith.extf %in_8 : f16 to f32
%44 = arith.mulf %42, %43 : f32
%45 = arith.addf %44, %out : f32
linalg.yield %45 : f32
} -> tensor<1x1x64x64xf32>
scf.yield %41, %33, %37 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>
}
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22#2, %22#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f32, %in_4: f32, %out: f16):
%25 = arith.divf %cst, %in : f32
%26 = arith.mulf %25, %in_4 : f32
%27 = arith.truncf %26 : f32 to f16
linalg.yield %27 : f16
} -> tensor<1x1x64x64xf16>
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23 : tensor<1x1x64x64xf16>) outs(%extracted_slice_3 : tensor<1x64x1x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %24 into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
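// note (annotation): canonicalization has folded the scale pre-multiplication 0.125 * 1.442380 into the single f16 constant 1.802980e-01 hoisted to the top of the function, and dropped the pass-through identity generic that sat between the QK matmul and the row-max reduction.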
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.802980e-01 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 1.000000e+00 : f32
%cst_1 = arith.constant -3.40282347E+38 : f32
%cst_2 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%15 = tensor.empty() : tensor<1x1x64x64xf16>
%16 = tensor.empty() : tensor<1x1x64x64xf32>
%17 = tensor.empty() : tensor<1x1x64xf32>
%18 = linalg.fill ins(%cst_2 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32>
%19 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%20 = linalg.fill ins(%cst_2 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%22:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %18, %arg6 = %19, %arg7 = %20) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) {
%extracted_slice_4 = tensor.extract_slice %12[%arg0, %arg1, %arg4, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%25 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_4 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%extracted_slice_5 = tensor.extract_slice %10[%arg0, %arg1, 0, %arg4] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x64xf16>
%26 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_5 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst : f16) outs(%21 : tensor<1x1x64x64xf16>) {
^bb0(%in: f16, %out: f16):
%40 = arith.mulf %in, %out : f16
linalg.yield %40 : f16
} -> tensor<1x1x64x64xf16>
%28 = tensor.empty() : tensor<1x1x64x64xf32>
%29 = linalg.fill ins(%cst_2 : f32) outs(%28 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32>
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>], iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel"]} ins(%27, %25 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%29 : tensor<1x1x64x64xf32>) attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} {
^bb0(%in: f16, %in_6: f16, %out: f32):
%40 = arith.extf %in : f16 to f32
%41 = arith.extf %in_6 : f16 to f32
%42 = arith.mulf %40, %41 : f32
%43 = arith.addf %42, %out : f32
linalg.yield %43 : f32
} -> tensor<1x1x64x64xf32>
%31 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%30 : tensor<1x1x64x64xf32>) outs(%arg6 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%40 = arith.maximumf %in, %out : f32
linalg.yield %40 : f32
} -> tensor<1x1x64xf32>
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%31 : tensor<1x1x64xf32>) outs(%arg6 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%40 = arith.subf %out, %in : f32
%41 = math.exp2 %40 : f32
linalg.yield %41 : f32
} -> tensor<1x1x64xf32>
%33 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%32 : tensor<1x1x64xf32>) outs(%arg7 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%40 = arith.mulf %in, %out : f32
linalg.yield %40 : f32
} -> tensor<1x1x64xf32>
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%31 : tensor<1x1x64xf32>) outs(%30 : tensor<1x1x64x64xf32>) {
^bb0(%in: f32, %out: f32):
%40 = arith.subf %out, %in : f32
%41 = math.exp2 %40 : f32
linalg.yield %41 : f32
} -> tensor<1x1x64x64xf32>
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%34 : tensor<1x1x64x64xf32>) outs(%33 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%40 = arith.addf %in, %out : f32
linalg.yield %40 : f32
} -> tensor<1x1x64xf32>
%36 = tensor.empty() : tensor<1x1x64x64xf16>
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%34 : tensor<1x1x64x64xf32>) outs(%36 : tensor<1x1x64x64xf16>) {
^bb0(%in: f32, %out: f16):
%40 = arith.truncf %in : f32 to f16
linalg.yield %40 : f16
} -> tensor<1x1x64x64xf16>
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%32 : tensor<1x1x64xf32>) outs(%arg5 : tensor<1x1x64x64xf32>) {
^bb0(%in: f32, %out: f32):
%40 = arith.mulf %in, %out : f32
linalg.yield %40 : f32
} -> tensor<1x1x64x64xf32>
%39 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%37, %26 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%38 : tensor<1x1x64x64xf32>) attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} {
^bb0(%in: f16, %in_6: f16, %out: f32):
%40 = arith.extf %in : f16 to f32
%41 = arith.extf %in_6 : f16 to f32
%42 = arith.mulf %40, %41 : f32
%43 = arith.addf %42, %out : f32
linalg.yield %43 : f32
} -> tensor<1x1x64x64xf32>
scf.yield %39, %31, %35 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>
}
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22#2, %22#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f32, %in_4: f32, %out: f16):
%25 = arith.divf %cst_0, %in : f32
%26 = arith.mulf %25, %in_4 : f32
%27 = arith.truncf %26 : f32 to f16
linalg.yield %27 : f16
} -> tensor<1x1x64x64xf16>
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23 : tensor<1x1x64x64xf16>) outs(%extracted_slice_3 : tensor<1x64x1x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %24 into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After CSE (cse) //----- //
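// note (annotation): CSE additionally reuses the zero-filled accumulator %18 as the init of the QK matmul instead of re-filling a fresh tensor on every loop iteration, and folds the remaining duplicate tensor.empty() ops (e.g. the f16 init for the P truncation) back into %15.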
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.802980e-01 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 1.000000e+00 : f32
%cst_1 = arith.constant -3.40282347E+38 : f32
%cst_2 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%15 = tensor.empty() : tensor<1x1x64x64xf16>
%16 = tensor.empty() : tensor<1x1x64x64xf32>
%17 = tensor.empty() : tensor<1x1x64xf32>
%18 = linalg.fill ins(%cst_2 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32>
%19 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%20 = linalg.fill ins(%cst_2 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%22:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %18, %arg6 = %19, %arg7 = %20) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) {
%extracted_slice_4 = tensor.extract_slice %12[%arg0, %arg1, %arg4, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%25 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_4 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%extracted_slice_5 = tensor.extract_slice %10[%arg0, %arg1, 0, %arg4] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x64xf16>
%26 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_5 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst : f16) outs(%21 : tensor<1x1x64x64xf16>) {
^bb0(%in: f16, %out: f16):
%37 = arith.mulf %in, %out : f16
linalg.yield %37 : f16
} -> tensor<1x1x64x64xf16>
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>], iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel"]} ins(%27, %25 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%18 : tensor<1x1x64x64xf32>) attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} {
^bb0(%in: f16, %in_6: f16, %out: f32):
%37 = arith.extf %in : f16 to f32
%38 = arith.extf %in_6 : f16 to f32
%39 = arith.mulf %37, %38 : f32
%40 = arith.addf %39, %out : f32
linalg.yield %40 : f32
} -> tensor<1x1x64x64xf32>
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%28 : tensor<1x1x64x64xf32>) outs(%arg6 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%37 = arith.maximumf %in, %out : f32
linalg.yield %37 : f32
} -> tensor<1x1x64xf32>
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%29 : tensor<1x1x64xf32>) outs(%arg6 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%37 = arith.subf %out, %in : f32
%38 = math.exp2 %37 : f32
linalg.yield %38 : f32
} -> tensor<1x1x64xf32>
%31 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%30 : tensor<1x1x64xf32>) outs(%arg7 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%37 = arith.mulf %in, %out : f32
linalg.yield %37 : f32
} -> tensor<1x1x64xf32>
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29 : tensor<1x1x64xf32>) outs(%28 : tensor<1x1x64x64xf32>) {
^bb0(%in: f32, %out: f32):
%37 = arith.subf %out, %in : f32
%38 = math.exp2 %37 : f32
linalg.yield %38 : f32
} -> tensor<1x1x64x64xf32>
%33 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%32 : tensor<1x1x64x64xf32>) outs(%31 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%37 = arith.addf %in, %out : f32
linalg.yield %37 : f32
} -> tensor<1x1x64xf32>
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%32 : tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f32, %out: f16):
%37 = arith.truncf %in : f32 to f16
linalg.yield %37 : f16
} -> tensor<1x1x64x64xf16>
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%30 : tensor<1x1x64xf32>) outs(%arg5 : tensor<1x1x64x64xf32>) {
^bb0(%in: f32, %out: f32):
%37 = arith.mulf %in, %out : f32
linalg.yield %37 : f32
} -> tensor<1x1x64x64xf32>
%36 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%34, %26 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%35 : tensor<1x1x64x64xf32>) attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} {
^bb0(%in: f16, %in_6: f16, %out: f32):
%37 = arith.extf %in : f16 to f32
%38 = arith.extf %in_6 : f16 to f32
%39 = arith.mulf %37, %38 : f32
%40 = arith.addf %39, %out : f32
linalg.yield %40 : f32
} -> tensor<1x1x64x64xf32>
scf.yield %36, %29, %33 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>
}
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22#2, %22#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f32, %in_4: f32, %out: f16):
%25 = arith.divf %cst_0, %in : f32
%26 = arith.mulf %25, %in_4 : f32
%27 = arith.truncf %26 : f32 to f16
linalg.yield %27 : f16
} -> tensor<1x1x64x64xf16>
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23 : tensor<1x1x64x64xf16>) outs(%extracted_slice_3 : tensor<1x64x1x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %24 into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After LLVMGPUConfigureTensorLayoutsPass (iree-llvmgpu-configure-tensor-layouts) //----- //
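// note (annotation): this pass anchors the attention operands to explicit #iree_vector_ext.nested_layout attributes (subgroup/batch/outer/thread/element tiles plus strides) via iree_vector_ext.to_layout ops; for example, the Q tile below is distributed over a 16x8 thread tile with 8-wide f16 element vectors, presumably so the later vector-distribution stage can honor the MFMA_F32_16x16x16_F16 / subgroup_m_count = 2 lowering configs.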
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.802980e-01 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 1.000000e+00 : f32
%cst_1 = arith.constant -3.40282347E+38 : f32
%cst_2 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%15 = tensor.empty() : tensor<1x1x64x64xf16>
%16 = tensor.empty() : tensor<1x1x64x64xf32>
%17 = tensor.empty() : tensor<1x1x64xf32>
%18 = linalg.fill ins(%cst_2 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32>
%19 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%20 = linalg.fill ins(%cst_2 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%22 = iree_vector_ext.to_layout %21 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 1], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 8], element_tile = [1, 1, 1, 8], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 8, 1]>) : tensor<1x1x64x64xf16>
%23:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %18, %arg6 = %19, %arg7 = %20) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) {
%extracted_slice_4 = tensor.extract_slice %12[%arg0, %arg1, %arg4, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%26 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_4 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%27 = iree_vector_ext.to_layout %26 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 1], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 8], element_tile = [1, 1, 1, 8], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 8, 1]>) : tensor<1x1x64x64xf16>
%extracted_slice_5 = tensor.extract_slice %10[%arg0, %arg1, 0, %arg4] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x64xf16>
%28 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_5 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%29 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 1], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 8], element_tile = [1, 1, 1, 8], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 8, 1]>) : tensor<1x1x64x64xf16>
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst : f16) outs(%22 : tensor<1x1x64x64xf16>) {
^bb0(%in: f16, %out: f16):
%48 = arith.mulf %in, %out : f16
linalg.yield %48 : f16
} -> tensor<1x1x64x64xf16>
%31 = iree_vector_ext.to_layout %27 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<1x1x64x64xf16>
%32 = iree_vector_ext.to_layout %30 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<1x1x64x64xf16>
%33 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32>
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>], iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel"]} ins(%31, %32 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%33 : tensor<1x1x64x64xf32>) attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} {
^bb0(%in: f16, %in_6: f16, %out: f32):
%48 = arith.extf %in : f16 to f32
%49 = arith.extf %in_6 : f16 to f32
%50 = arith.mulf %48, %49 : f32
%51 = arith.addf %50, %out : f32
linalg.yield %51 : f32
} -> tensor<1x1x64x64xf32>
%35 = iree_vector_ext.to_layout %34 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32>
%36 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%35 : tensor<1x1x64x64xf32>) outs(%arg6 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%48 = arith.maximumf %in, %out : f32
linalg.yield %48 : f32
} -> tensor<1x1x64xf32>
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%36 : tensor<1x1x64xf32>) outs(%arg6 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%48 = arith.subf %out, %in : f32
%49 = math.exp2 %48 : f32
linalg.yield %49 : f32
} -> tensor<1x1x64xf32>
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%37 : tensor<1x1x64xf32>) outs(%arg7 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%48 = arith.mulf %in, %out : f32
linalg.yield %48 : f32
} -> tensor<1x1x64xf32>
%39 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%36 : tensor<1x1x64xf32>) outs(%35 : tensor<1x1x64x64xf32>) {
^bb0(%in: f32, %out: f32):
%48 = arith.subf %out, %in : f32
%49 = math.exp2 %48 : f32
linalg.yield %49 : f32
} -> tensor<1x1x64x64xf32>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%39 : tensor<1x1x64x64xf32>) outs(%38 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%48 = arith.addf %in, %out : f32
linalg.yield %48 : f32
} -> tensor<1x1x64xf32>
%41 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%39 : tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f32, %out: f16):
%48 = arith.truncf %in : f32 to f16
linalg.yield %48 : f16
} -> tensor<1x1x64x64xf16>
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%37 : tensor<1x1x64xf32>) outs(%arg5 : tensor<1x1x64x64xf32>) {
^bb0(%in: f32, %out: f32):
%48 = arith.mulf %in, %out : f32
linalg.yield %48 : f32
} -> tensor<1x1x64x64xf32>
%43 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<1x1x64x64xf16>
%44 = iree_vector_ext.to_layout %41 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf16>
%45 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32>
%46 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%43, %44 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%45 : tensor<1x1x64x64xf32>) attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} {
^bb0(%in: f16, %in_6: f16, %out: f32):
%48 = arith.extf %in : f16 to f32
%49 = arith.extf %in_6 : f16 to f32
%50 = arith.mulf %48, %49 : f32
%51 = arith.addf %50, %out : f32
linalg.yield %51 : f32
} -> tensor<1x1x64x64xf32>
%47 = iree_vector_ext.to_layout %46 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32>
scf.yield %47, %36, %40 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>
}
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23#2, %23#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f32, %in_4: f32, %out: f16):
%26 = arith.divf %cst_0, %in : f32
%27 = arith.mulf %26, %in_4 : f32
%28 = arith.truncf %27 : f32 to f16
linalg.yield %28 : f16
} -> tensor<1x1x64x64xf16>
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%24 : tensor<1x1x64x64xf16>) outs(%extracted_slice_3 : tensor<1x64x1x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %25 into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After IREELoopInvariantCodeMotionPass (iree-loop-invariant-code-motion) //----- //
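// NOTE: loop-invariant code motion hoists the Q-side work that does not depend on the K/V loop
// induction variable %arg4 out of the scf.for over the 4096-long sequence: the scale-by-%cst generic
// on the Q tile, its MFMA-layout to_layout, and the to_layout of the zero accumulator now run once
// per workgroup tile instead of once per loop iteration.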
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.802980e-01 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 1.000000e+00 : f32
%cst_1 = arith.constant -3.40282347E+38 : f32
%cst_2 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%15 = tensor.empty() : tensor<1x1x64x64xf16>
%16 = tensor.empty() : tensor<1x1x64x64xf32>
%17 = tensor.empty() : tensor<1x1x64xf32>
%18 = linalg.fill ins(%cst_2 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32>
%19 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%20 = linalg.fill ins(%cst_2 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32>
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%22 = iree_vector_ext.to_layout %21 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 1], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 8], element_tile = [1, 1, 1, 8], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 8, 1]>) : tensor<1x1x64x64xf16>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst : f16) outs(%22 : tensor<1x1x64x64xf16>) {
^bb0(%in: f16, %out: f16):
%29 = arith.mulf %in, %out : f16
linalg.yield %29 : f16
} -> tensor<1x1x64x64xf16>
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<1x1x64x64xf16>
%25 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32>
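      // The scf.for below is the online-softmax (flash-attention) recurrence over K/V tiles:
      // %36 updates the running row max, %37 is the exp2 correction factor for the stale max,
      // %38/%40 update the running softmax sum, %42 rescales the partial PV accumulator, and the
      // attention_pv_matmul accumulates the newly weighted V tile into it.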
%26:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %18, %arg6 = %19, %arg7 = %20) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) {
%extracted_slice_4 = tensor.extract_slice %12[%arg0, %arg1, %arg4, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%29 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_4 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%30 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 1], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 8], element_tile = [1, 1, 1, 8], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 8, 1]>) : tensor<1x1x64x64xf16>
%extracted_slice_5 = tensor.extract_slice %10[%arg0, %arg1, 0, %arg4] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x64xf16>
%31 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_5 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16>
%32 = iree_vector_ext.to_layout %31 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 1], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 8], element_tile = [1, 1, 1, 8], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 8, 1]>) : tensor<1x1x64x64xf16>
%33 = iree_vector_ext.to_layout %30 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<1x1x64x64xf16>
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>], iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel"]} ins(%33, %24 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%25 : tensor<1x1x64x64xf32>) attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} {
^bb0(%in: f16, %in_6: f16, %out: f32):
%48 = arith.extf %in : f16 to f32
%49 = arith.extf %in_6 : f16 to f32
%50 = arith.mulf %48, %49 : f32
%51 = arith.addf %50, %out : f32
linalg.yield %51 : f32
} -> tensor<1x1x64x64xf32>
%35 = iree_vector_ext.to_layout %34 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32>
%36 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%35 : tensor<1x1x64x64xf32>) outs(%arg6 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%48 = arith.maximumf %in, %out : f32
linalg.yield %48 : f32
} -> tensor<1x1x64xf32>
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%36 : tensor<1x1x64xf32>) outs(%arg6 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%48 = arith.subf %out, %in : f32
%49 = math.exp2 %48 : f32
linalg.yield %49 : f32
} -> tensor<1x1x64xf32>
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%37 : tensor<1x1x64xf32>) outs(%arg7 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%48 = arith.mulf %in, %out : f32
linalg.yield %48 : f32
} -> tensor<1x1x64xf32>
%39 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%36 : tensor<1x1x64xf32>) outs(%35 : tensor<1x1x64x64xf32>) {
^bb0(%in: f32, %out: f32):
%48 = arith.subf %out, %in : f32
%49 = math.exp2 %48 : f32
linalg.yield %49 : f32
} -> tensor<1x1x64x64xf32>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%39 : tensor<1x1x64x64xf32>) outs(%38 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%48 = arith.addf %in, %out : f32
linalg.yield %48 : f32
} -> tensor<1x1x64xf32>
%41 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%39 : tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f32, %out: f16):
%48 = arith.truncf %in : f32 to f16
linalg.yield %48 : f16
} -> tensor<1x1x64x64xf16>
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%37 : tensor<1x1x64xf32>) outs(%arg5 : tensor<1x1x64x64xf32>) {
^bb0(%in: f32, %out: f32):
%48 = arith.mulf %in, %out : f32
linalg.yield %48 : f32
} -> tensor<1x1x64x64xf32>
%43 = iree_vector_ext.to_layout %32 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<1x1x64x64xf16>
%44 = iree_vector_ext.to_layout %41 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf16>
%45 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32>
%46 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%43, %44 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%45 : tensor<1x1x64x64xf32>) attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} {
^bb0(%in: f16, %in_6: f16, %out: f32):
%48 = arith.extf %in : f16 to f32
%49 = arith.extf %in_6 : f16 to f32
%50 = arith.mulf %48, %49 : f32
%51 = arith.addf %50, %out : f32
linalg.yield %51 : f32
} -> tensor<1x1x64x64xf32>
%47 = iree_vector_ext.to_layout %46 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32>
scf.yield %47, %36, %40 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>
}
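      // Epilogue: %27 normalizes the accumulated PV result by the final softmax sum
      // (1.0 / sum * acc, truncated to f16) and %28 permutes the tile from (batch, head, m, d)
      // order into the 2x4096x10x64 output layout before the parallel_insert_slice.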
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%26#2, %26#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f32, %in_4: f32, %out: f16):
%29 = arith.divf %cst_0, %in : f32
%30 = arith.mulf %29, %in_4 : f32
%31 = arith.truncf %30 : f32 to f16
linalg.yield %31 : f16
} -> tensor<1x1x64x64xf16>
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%27 : tensor<1x1x64x64xf16>) outs(%extracted_slice_3 : tensor<1x64x1x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %28 into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After LinalgGeneralizeNamedOpsPass (linalg-generalize-named-ops) //----- //
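// NOTE: linalg-generalize-named-ops rewrites the named ops (linalg.fill, linalg.copy) as equivalent
// linalg.generic ops with identity/broadcast indexing maps and a bare linalg.yield; the attention
// matmuls and the rest of the IR are unchanged.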
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.802980e-01 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 1.000000e+00 : f32
%cst_1 = arith.constant -3.40282347E+38 : f32
%cst_2 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%15 = tensor.empty() : tensor<1x1x64x64xf16>
%16 = tensor.empty() : tensor<1x1x64x64xf32>
%17 = tensor.empty() : tensor<1x1x64xf32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2 : f32) outs(%16 : tensor<1x1x64x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x1x64x64xf32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x1x64xf32>
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_2 : f32) outs(%17 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x1x64xf32>
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x1x64x64xf16>
%22 = iree_vector_ext.to_layout %21 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 1], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 8], element_tile = [1, 1, 1, 8], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 8, 1]>) : tensor<1x1x64x64xf16>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst : f16) outs(%22 : tensor<1x1x64x64xf16>) {
^bb0(%in: f16, %out: f16):
%29 = arith.mulf %in, %out : f16
linalg.yield %29 : f16
} -> tensor<1x1x64x64xf16>
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<1x1x64x64xf16>
%25 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32>
%26:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %18, %arg6 = %19, %arg7 = %20) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) {
%extracted_slice_4 = tensor.extract_slice %12[%arg0, %arg1, %arg4, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_4 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x1x64x64xf16>
%30 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 1], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 8], element_tile = [1, 1, 1, 8], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 8, 1]>) : tensor<1x1x64x64xf16>
%extracted_slice_5 = tensor.extract_slice %10[%arg0, %arg1, 0, %arg4] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x64xf16>
%31 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_5 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x1x64x64xf16>
%32 = iree_vector_ext.to_layout %31 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 1], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 8], element_tile = [1, 1, 1, 8], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 8, 1]>) : tensor<1x1x64x64xf16>
%33 = iree_vector_ext.to_layout %30 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<1x1x64x64xf16>
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>], iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel"]} ins(%33, %24 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%25 : tensor<1x1x64x64xf32>) attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} {
^bb0(%in: f16, %in_6: f16, %out: f32):
%48 = arith.extf %in : f16 to f32
%49 = arith.extf %in_6 : f16 to f32
%50 = arith.mulf %48, %49 : f32
%51 = arith.addf %50, %out : f32
linalg.yield %51 : f32
} -> tensor<1x1x64x64xf32>
%35 = iree_vector_ext.to_layout %34 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32>
%36 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%35 : tensor<1x1x64x64xf32>) outs(%arg6 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%48 = arith.maximumf %in, %out : f32
linalg.yield %48 : f32
} -> tensor<1x1x64xf32>
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%36 : tensor<1x1x64xf32>) outs(%arg6 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%48 = arith.subf %out, %in : f32
%49 = math.exp2 %48 : f32
linalg.yield %49 : f32
} -> tensor<1x1x64xf32>
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%37 : tensor<1x1x64xf32>) outs(%arg7 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%48 = arith.mulf %in, %out : f32
linalg.yield %48 : f32
} -> tensor<1x1x64xf32>
%39 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%36 : tensor<1x1x64xf32>) outs(%35 : tensor<1x1x64x64xf32>) {
^bb0(%in: f32, %out: f32):
%48 = arith.subf %out, %in : f32
%49 = math.exp2 %48 : f32
linalg.yield %49 : f32
} -> tensor<1x1x64x64xf32>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%39 : tensor<1x1x64x64xf32>) outs(%38 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%48 = arith.addf %in, %out : f32
linalg.yield %48 : f32
} -> tensor<1x1x64xf32>
%41 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%39 : tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f32, %out: f16):
%48 = arith.truncf %in : f32 to f16
linalg.yield %48 : f16
} -> tensor<1x1x64x64xf16>
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%37 : tensor<1x1x64xf32>) outs(%arg5 : tensor<1x1x64x64xf32>) {
^bb0(%in: f32, %out: f32):
%48 = arith.mulf %in, %out : f32
linalg.yield %48 : f32
} -> tensor<1x1x64x64xf32>
%43 = iree_vector_ext.to_layout %32 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<1x1x64x64xf16>
%44 = iree_vector_ext.to_layout %41 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf16>
%45 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32>
%46 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%43, %44 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%45 : tensor<1x1x64x64xf32>) attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} {
^bb0(%in: f16, %in_6: f16, %out: f32):
%48 = arith.extf %in : f16 to f32
%49 = arith.extf %in_6 : f16 to f32
%50 = arith.mulf %48, %49 : f32
%51 = arith.addf %50, %out : f32
linalg.yield %51 : f32
} -> tensor<1x1x64x64xf32>
%47 = iree_vector_ext.to_layout %46 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32>
scf.yield %47, %36, %40 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>
}
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%26#2, %26#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f32, %in_4: f32, %out: f16):
%29 = arith.divf %cst_0, %in : f32
%30 = arith.mulf %29, %in_4 : f32
%31 = arith.truncf %30 : f32 to f16
linalg.yield %31 : f16
} -> tensor<1x1x64x64xf16>
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%27 : tensor<1x1x64x64xf16>) outs(%extracted_slice_3 : tensor<1x64x1x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %28 into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After VectorExtFoldUnitExtentDimsPass (iree-vector-ext-fold-unit-extent-dims) //----- //
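// NOTE: iree-vector-ext-fold-unit-extent-dims drops the two leading unit dimensions around each
// to_layout: operands are rank-reduced from tensor<1x1x64x64xf16> / tensor<1x1x64x64xf32> to
// tensor<64x64xf16> / tensor<64x64xf32> with tensor.extract_slice, given an equivalent rank-2
// nested_layout, and re-expanded with tensor.insert_slice so the surrounding linalg.generic ops
// keep their 1x1x64x64 shapes.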
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.802980e-01 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 1.000000e+00 : f32
%cst_1 = arith.constant -3.40282347E+38 : f32
%cst_2 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%15 = tensor.empty() : tensor<1x1x64x64xf16>
%16 = tensor.empty() : tensor<1x1x64x64xf32>
%17 = tensor.empty() : tensor<1x1x64xf32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2 : f32) outs(%16 : tensor<1x1x64x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x1x64x64xf32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x1x64xf32>
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_2 : f32) outs(%17 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x1x64xf32>
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x1x64x64xf16>
%extracted_slice_3 = tensor.extract_slice %21[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%22 = iree_vector_ext.to_layout %extracted_slice_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16>
%23 = tensor.empty() : tensor<1x1x64x64xf16>
%inserted_slice = tensor.insert_slice %22 into %23[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x1x64x64xf16>
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst : f16) outs(%inserted_slice : tensor<1x1x64x64xf16>) {
^bb0(%in: f16, %out: f16):
%32 = arith.mulf %in, %out : f16
linalg.yield %32 : f16
} -> tensor<1x1x64x64xf16>
%extracted_slice_4 = tensor.extract_slice %24[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%25 = iree_vector_ext.to_layout %extracted_slice_4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16>
%26 = tensor.empty() : tensor<1x1x64x64xf16>
%inserted_slice_5 = tensor.insert_slice %25 into %26[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x1x64x64xf16>
%extracted_slice_6 = tensor.extract_slice %18[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf32> to tensor<64x64xf32>
%27 = iree_vector_ext.to_layout %extracted_slice_6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%28 = tensor.empty() : tensor<1x1x64x64xf32>
%inserted_slice_7 = tensor.insert_slice %27 into %28[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf32> into tensor<1x1x64x64xf32>
%29:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %18, %arg6 = %19, %arg7 = %20) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) {
%extracted_slice_9 = tensor.extract_slice %12[%arg0, %arg1, %arg4, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_9 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x1x64x64xf16>
%extracted_slice_10 = tensor.extract_slice %32[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%33 = iree_vector_ext.to_layout %extracted_slice_10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16>
%extracted_slice_11 = tensor.extract_slice %10[%arg0, %arg1, 0, %arg4] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x64xf16>
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_11 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x1x64x64xf16>
%extracted_slice_12 = tensor.extract_slice %34[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%35 = iree_vector_ext.to_layout %extracted_slice_12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16>
%36 = iree_vector_ext.to_layout %33 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16>
%37 = tensor.empty() : tensor<1x1x64x64xf16>
%inserted_slice_13 = tensor.insert_slice %36 into %37[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x1x64x64xf16>
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>], iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel"]} ins(%inserted_slice_13, %inserted_slice_5 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%inserted_slice_7 : tensor<1x1x64x64xf32>) attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} {
^bb0(%in: f16, %in_23: f16, %out: f32):
%57 = arith.extf %in : f16 to f32
%58 = arith.extf %in_23 : f16 to f32
%59 = arith.mulf %57, %58 : f32
%60 = arith.addf %59, %out : f32
linalg.yield %60 : f32
} -> tensor<1x1x64x64xf32>
%extracted_slice_14 = tensor.extract_slice %38[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf32> to tensor<64x64xf32>
%39 = iree_vector_ext.to_layout %extracted_slice_14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%40 = tensor.empty() : tensor<1x1x64x64xf32>
%inserted_slice_15 = tensor.insert_slice %39 into %40[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf32> into tensor<1x1x64x64xf32>
%41 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%inserted_slice_15 : tensor<1x1x64x64xf32>) outs(%arg6 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%57 = arith.maximumf %in, %out : f32
linalg.yield %57 : f32
} -> tensor<1x1x64xf32>
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%41 : tensor<1x1x64xf32>) outs(%arg6 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%57 = arith.subf %out, %in : f32
%58 = math.exp2 %57 : f32
linalg.yield %58 : f32
} -> tensor<1x1x64xf32>
%43 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%42 : tensor<1x1x64xf32>) outs(%arg7 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%57 = arith.mulf %in, %out : f32
linalg.yield %57 : f32
} -> tensor<1x1x64xf32>
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%41 : tensor<1x1x64xf32>) outs(%inserted_slice_15 : tensor<1x1x64x64xf32>) {
^bb0(%in: f32, %out: f32):
%57 = arith.subf %out, %in : f32
%58 = math.exp2 %57 : f32
linalg.yield %58 : f32
} -> tensor<1x1x64x64xf32>
%45 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%44 : tensor<1x1x64x64xf32>) outs(%43 : tensor<1x1x64xf32>) {
^bb0(%in: f32, %out: f32):
%57 = arith.addf %in, %out : f32
linalg.yield %57 : f32
} -> tensor<1x1x64xf32>
%46 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%44 : tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f32, %out: f16):
%57 = arith.truncf %in : f32 to f16
linalg.yield %57 : f16
} -> tensor<1x1x64x64xf16>
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%42 : tensor<1x1x64xf32>) outs(%arg5 : tensor<1x1x64x64xf32>) {
^bb0(%in: f32, %out: f32):
%57 = arith.mulf %in, %out : f32
linalg.yield %57 : f32
} -> tensor<1x1x64x64xf32>
%48 = iree_vector_ext.to_layout %35 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16>
%49 = tensor.empty() : tensor<1x1x64x64xf16>
%inserted_slice_16 = tensor.insert_slice %48 into %49[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x1x64x64xf16>
%extracted_slice_17 = tensor.extract_slice %46[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%50 = iree_vector_ext.to_layout %extracted_slice_17 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf16>
%51 = tensor.empty() : tensor<1x1x64x64xf16>
%inserted_slice_18 = tensor.insert_slice %50 into %51[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x1x64x64xf16>
%extracted_slice_19 = tensor.extract_slice %47[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf32> to tensor<64x64xf32>
%52 = iree_vector_ext.to_layout %extracted_slice_19 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%53 = tensor.empty() : tensor<1x1x64x64xf32>
%inserted_slice_20 = tensor.insert_slice %52 into %53[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf32> into tensor<1x1x64x64xf32>
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%inserted_slice_16, %inserted_slice_18 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%inserted_slice_20 : tensor<1x1x64x64xf32>) attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} {
^bb0(%in: f16, %in_23: f16, %out: f32):
%57 = arith.extf %in : f16 to f32
%58 = arith.extf %in_23 : f16 to f32
%59 = arith.mulf %57, %58 : f32
%60 = arith.addf %59, %out : f32
linalg.yield %60 : f32
} -> tensor<1x1x64x64xf32>
%extracted_slice_21 = tensor.extract_slice %54[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf32> to tensor<64x64xf32>
%55 = iree_vector_ext.to_layout %extracted_slice_21 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%56 = tensor.empty() : tensor<1x1x64x64xf32>
%inserted_slice_22 = tensor.insert_slice %55 into %56[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf32> into tensor<1x1x64x64xf32>
scf.yield %inserted_slice_22, %41, %45 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>
}
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29#2, %29#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) {
^bb0(%in: f32, %in_9: f32, %out: f16):
%32 = arith.divf %cst_0, %in : f32
%33 = arith.mulf %32, %in_9 : f32
%34 = arith.truncf %33 : f32 to f16
linalg.yield %34 : f16
} -> tensor<1x1x64x64xf16>
%extracted_slice_8 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%31 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%30 : tensor<1x1x64x64xf16>) outs(%extracted_slice_8 : tensor<1x64x1x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %31 into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
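// NOTE (reader annotation, not produced by the compiler): the function above is the fully tiled
// attention body. Each scf.forall iteration owns one (batch %arg0, head %arg1, 64-row query %arg2)
// tile of the 2x10x4096x64 problem, and the inner scf.for (%arg4 = 0 to 4096 step 64) walks the
// key/value sequence while carrying the streaming-softmax state: accumulator %arg5, running
// row-max %arg6, running row-sum %arg7. Read as a sketch, one loop iteration appears to compute,
// per query row:
//   S     = (scale * Q) * K^T                      (qk matmul %38, Q pre-scaled by %cst in %24)
//   m_new = max(m_old, rowmax(S))                  (%41)
//   c     = exp2(m_old - m_new)                    (%42)
//   P     = exp2(S - m_new)                        (%44, truncated to f16 in %46)
//   l_new = c * l_old + rowsum(P)                  (%43, %45)
//   acc   = c * acc_old + P * V                    (%47, pv matmul %54)
// After the loop, %30 multiplies the accumulator by 1/l and truncates to f16 before the transposed
// store into the 2x4096x10x64 output tile.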
// -----// IR Dump After LinalgFoldUnitExtentDimsPass (linalg-fold-unit-extent-dims) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.802980e-01 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 1.000000e+00 : f32
%cst_1 = arith.constant -3.40282347E+38 : f32
%cst_2 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%15 = tensor.empty() : tensor<1x1x64x64xf32>
%16 = tensor.empty() : tensor<1x1x64xf32>
%17 = tensor.empty() : tensor<64x64xf32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst_2 : f32) outs(%17 : tensor<64x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64x64xf32>
%inserted_slice = tensor.insert_slice %18 into %15[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf32> into tensor<1x1x64x64xf32>
%19 = tensor.empty() : tensor<64xf32>
%20 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_1 : f32) outs(%19 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64xf32>
%inserted_slice_3 = tensor.insert_slice %20 into %16[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32>
%21 = tensor.empty() : tensor<64xf32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_2 : f32) outs(%21 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64xf32>
%inserted_slice_4 = tensor.insert_slice %22 into %16[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%23 = tensor.empty() : tensor<64x64xf16>
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_5 : tensor<64x64xf16>) outs(%23 : tensor<64x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<64x64xf16>
%25 = iree_vector_ext.to_layout %24 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16>
%26 = tensor.empty() : tensor<64x64xf16>
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst, %25 : f16, tensor<64x64xf16>) outs(%26 : tensor<64x64xf16>) {
^bb0(%in: f16, %in_11: f16, %out: f16):
%34 = arith.mulf %in, %in_11 : f16
linalg.yield %34 : f16
} -> tensor<64x64xf16>
%28 = iree_vector_ext.to_layout %27 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16>
%29 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%30:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %inserted_slice, %arg6 = %inserted_slice_3, %arg7 = %inserted_slice_4) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) {
%extracted_slice_11 = tensor.extract_slice %12[%arg0, %arg1, %arg4, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%extracted_slice_12 = tensor.extract_slice %extracted_slice_11[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%34 = tensor.empty() : tensor<64x64xf16>
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_12 : tensor<64x64xf16>) outs(%34 : tensor<64x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<64x64xf16>
%36 = iree_vector_ext.to_layout %35 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16>
%extracted_slice_13 = tensor.extract_slice %10[%arg0, %arg1, 0, %arg4] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x64xf16>
%extracted_slice_14 = tensor.extract_slice %extracted_slice_13[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%37 = tensor.empty() : tensor<64x64xf16>
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_14 : tensor<64x64xf16>) outs(%37 : tensor<64x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<64x64xf16>
%39 = iree_vector_ext.to_layout %38 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16>
%40 = iree_vector_ext.to_layout %36 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16>
%41 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%40, %28 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%29 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_22: f16, %out: f32):
%61 = arith.extf %in : f16 to f32
%62 = arith.extf %in_22 : f16 to f32
%63 = arith.mulf %61, %62 : f32
%64 = arith.addf %63, %out : f32
linalg.yield %64 : f32
} -> tensor<64x64xf32>
%42 = iree_vector_ext.to_layout %41 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%43 = tensor.empty() : tensor<64x64xf32>
%extracted_slice_15 = tensor.extract_slice %arg6[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x1x64xf32> to tensor<64xf32>
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%42 : tensor<64x64xf32>) outs(%extracted_slice_15 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
%61 = arith.maximumf %in, %out : f32
linalg.yield %61 : f32
} -> tensor<64xf32>
%inserted_slice_16 = tensor.insert_slice %44 into %arg6[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32>
%extracted_slice_17 = tensor.extract_slice %arg6[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x1x64xf32> to tensor<64xf32>
%45 = tensor.empty() : tensor<64xf32>
%46 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%44, %extracted_slice_17 : tensor<64xf32>, tensor<64xf32>) outs(%45 : tensor<64xf32>) {
^bb0(%in: f32, %in_22: f32, %out: f32):
%61 = arith.subf %in_22, %in : f32
%62 = math.exp2 %61 : f32
linalg.yield %62 : f32
} -> tensor<64xf32>
%extracted_slice_18 = tensor.extract_slice %arg7[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x1x64xf32> to tensor<64xf32>
%47 = tensor.empty() : tensor<64xf32>
%48 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%46, %extracted_slice_18 : tensor<64xf32>, tensor<64xf32>) outs(%47 : tensor<64xf32>) {
^bb0(%in: f32, %in_22: f32, %out: f32):
%61 = arith.mulf %in, %in_22 : f32
linalg.yield %61 : f32
} -> tensor<64xf32>
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%44, %42 : tensor<64xf32>, tensor<64x64xf32>) outs(%43 : tensor<64x64xf32>) {
^bb0(%in: f32, %in_22: f32, %out: f32):
%61 = arith.subf %in_22, %in : f32
%62 = math.exp2 %61 : f32
linalg.yield %62 : f32
} -> tensor<64x64xf32>
%50 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%49 : tensor<64x64xf32>) outs(%48 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
%61 = arith.addf %in, %out : f32
linalg.yield %61 : f32
} -> tensor<64xf32>
%inserted_slice_19 = tensor.insert_slice %50 into %arg7[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32>
%51 = tensor.empty() : tensor<64x64xf16>
%52 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%49 : tensor<64x64xf32>) outs(%51 : tensor<64x64xf16>) {
^bb0(%in: f32, %out: f16):
%61 = arith.truncf %in : f32 to f16
linalg.yield %61 : f16
} -> tensor<64x64xf16>
%extracted_slice_20 = tensor.extract_slice %arg5[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf32> to tensor<64x64xf32>
%53 = tensor.empty() : tensor<64x64xf32>
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%46, %extracted_slice_20 : tensor<64xf32>, tensor<64x64xf32>) outs(%53 : tensor<64x64xf32>) {
^bb0(%in: f32, %in_22: f32, %out: f32):
%61 = arith.mulf %in, %in_22 : f32
linalg.yield %61 : f32
} -> tensor<64x64xf32>
%55 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16>
%56 = iree_vector_ext.to_layout %52 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf16>
%57 = iree_vector_ext.to_layout %54 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%58 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%55, %56 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%57 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_22: f16, %out: f32):
%61 = arith.extf %in : f16 to f32
%62 = arith.extf %in_22 : f16 to f32
%63 = arith.mulf %61, %62 : f32
%64 = arith.addf %63, %out : f32
linalg.yield %64 : f32
} -> tensor<64x64xf32>
%59 = iree_vector_ext.to_layout %58 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%60 = tensor.empty() : tensor<1x1x64x64xf32>
%inserted_slice_21 = tensor.insert_slice %59 into %60[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf32> into tensor<1x1x64x64xf32>
scf.yield %inserted_slice_21, %inserted_slice_16, %inserted_slice_19 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>
}
%extracted_slice_6 = tensor.extract_slice %30#2[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x1x64xf32> to tensor<64xf32>
%extracted_slice_7 = tensor.extract_slice %30#0[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf32> to tensor<64x64xf32>
%31 = tensor.empty() : tensor<64x64xf16>
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_6, %extracted_slice_7 : tensor<64xf32>, tensor<64x64xf32>) outs(%31 : tensor<64x64xf16>) {
^bb0(%in: f32, %in_11: f32, %out: f16):
%34 = arith.divf %cst_0, %in : f32
%35 = arith.mulf %34, %in_11 : f32
%36 = arith.truncf %35 : f32 to f16
linalg.yield %36 : f16
} -> tensor<64x64xf16>
%extracted_slice_8 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%extracted_slice_9 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> to tensor<64x64xf16>
%33 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%32 : tensor<64x64xf16>) outs(%extracted_slice_9 : tensor<64x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<64x64xf16>
%inserted_slice_10 = tensor.insert_slice %33 into %extracted_slice_8[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %inserted_slice_10 into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
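// NOTE (reader annotation, not produced by the compiler): LinalgFoldUnitExtentDimsPass stripped the
// leading 1x1 unit dimensions, so the same computation is now expressed on rank-2 64x64 (and rank-1
// 64) tensors, with tensor.extract_slice / tensor.insert_slice reshapes kept only where the
// loop-carried values retain their original 1x1x64[x64] types; the qk/pv matmuls correspondingly
// drop to 3-D iteration spaces. The f16 scale %cst = 1.802980e-01 is consistent with the
// 1/sqrt(64) = 0.125 attention scale folded together with log2(e), since the softmax here is
// written with math.exp2: 0.125 * 1.442695 ~= 0.180337, which rounds to 1.802980e-01 in f16.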
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.802980e-01 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 1.000000e+00 : f32
%cst_1 = arith.constant -3.40282347E+38 : f32
%cst_2 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%15 = tensor.empty() : tensor<1x1x64x64xf32>
%16 = tensor.empty() : tensor<1x1x64xf32>
%17 = tensor.empty() : tensor<64x64xf32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst_2 : f32) outs(%17 : tensor<64x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64x64xf32>
%inserted_slice = tensor.insert_slice %18 into %15[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf32> into tensor<1x1x64x64xf32>
%19 = tensor.empty() : tensor<64xf32>
%20 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_1 : f32) outs(%19 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64xf32>
%inserted_slice_3 = tensor.insert_slice %20 into %16[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32>
%21 = tensor.empty() : tensor<64xf32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_2 : f32) outs(%21 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64xf32>
%inserted_slice_4 = tensor.insert_slice %22 into %16[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%23 = iree_vector_ext.to_layout %extracted_slice_5 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16>
%24 = tensor.empty() : tensor<64x64xf16>
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst, %23 : f16, tensor<64x64xf16>) outs(%24 : tensor<64x64xf16>) {
^bb0(%in: f16, %in_10: f16, %out: f16):
%31 = arith.mulf %in, %in_10 : f16
linalg.yield %31 : f16
} -> tensor<64x64xf16>
%26 = iree_vector_ext.to_layout %25 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16>
%27 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%28:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %inserted_slice, %arg6 = %inserted_slice_3, %arg7 = %inserted_slice_4) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) {
%extracted_slice_10 = tensor.extract_slice %12[%arg0, %arg1, %arg4, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%extracted_slice_11 = tensor.extract_slice %extracted_slice_10[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%31 = iree_vector_ext.to_layout %extracted_slice_11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16>
%extracted_slice_12 = tensor.extract_slice %10[%arg0, %arg1, 0, %arg4] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x64xf16>
%extracted_slice_13 = tensor.extract_slice %extracted_slice_12[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%32 = iree_vector_ext.to_layout %extracted_slice_13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16>
%33 = iree_vector_ext.to_layout %31 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16>
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%33, %26 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%27 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_21: f16, %out: f32):
%54 = arith.extf %in : f16 to f32
%55 = arith.extf %in_21 : f16 to f32
%56 = arith.mulf %54, %55 : f32
%57 = arith.addf %56, %out : f32
linalg.yield %57 : f32
} -> tensor<64x64xf32>
%35 = iree_vector_ext.to_layout %34 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%36 = tensor.empty() : tensor<64x64xf32>
%extracted_slice_14 = tensor.extract_slice %arg6[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x1x64xf32> to tensor<64xf32>
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%35 : tensor<64x64xf32>) outs(%extracted_slice_14 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
%54 = arith.maximumf %in, %out : f32
linalg.yield %54 : f32
} -> tensor<64xf32>
%inserted_slice_15 = tensor.insert_slice %37 into %arg6[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32>
%extracted_slice_16 = tensor.extract_slice %arg6[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x1x64xf32> to tensor<64xf32>
%38 = tensor.empty() : tensor<64xf32>
%39 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%37, %extracted_slice_16 : tensor<64xf32>, tensor<64xf32>) outs(%38 : tensor<64xf32>) {
^bb0(%in: f32, %in_21: f32, %out: f32):
%54 = arith.subf %in_21, %in : f32
%55 = math.exp2 %54 : f32
linalg.yield %55 : f32
} -> tensor<64xf32>
%extracted_slice_17 = tensor.extract_slice %arg7[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x1x64xf32> to tensor<64xf32>
%40 = tensor.empty() : tensor<64xf32>
%41 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%39, %extracted_slice_17 : tensor<64xf32>, tensor<64xf32>) outs(%40 : tensor<64xf32>) {
^bb0(%in: f32, %in_21: f32, %out: f32):
%54 = arith.mulf %in, %in_21 : f32
linalg.yield %54 : f32
} -> tensor<64xf32>
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%37, %35 : tensor<64xf32>, tensor<64x64xf32>) outs(%36 : tensor<64x64xf32>) {
^bb0(%in: f32, %in_21: f32, %out: f32):
%54 = arith.subf %in_21, %in : f32
%55 = math.exp2 %54 : f32
linalg.yield %55 : f32
} -> tensor<64x64xf32>
%43 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%42 : tensor<64x64xf32>) outs(%41 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
%54 = arith.addf %in, %out : f32
linalg.yield %54 : f32
} -> tensor<64xf32>
%inserted_slice_18 = tensor.insert_slice %43 into %arg7[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32>
%44 = tensor.empty() : tensor<64x64xf16>
%45 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%42 : tensor<64x64xf32>) outs(%44 : tensor<64x64xf16>) {
^bb0(%in: f32, %out: f16):
%54 = arith.truncf %in : f32 to f16
linalg.yield %54 : f16
} -> tensor<64x64xf16>
%extracted_slice_19 = tensor.extract_slice %arg5[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf32> to tensor<64x64xf32>
%46 = tensor.empty() : tensor<64x64xf32>
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%39, %extracted_slice_19 : tensor<64xf32>, tensor<64x64xf32>) outs(%46 : tensor<64x64xf32>) {
^bb0(%in: f32, %in_21: f32, %out: f32):
%54 = arith.mulf %in, %in_21 : f32
linalg.yield %54 : f32
} -> tensor<64x64xf32>
%48 = iree_vector_ext.to_layout %32 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16>
%49 = iree_vector_ext.to_layout %45 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf16>
%50 = iree_vector_ext.to_layout %47 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%51 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%48, %49 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%50 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_21: f16, %out: f32):
%54 = arith.extf %in : f16 to f32
%55 = arith.extf %in_21 : f16 to f32
%56 = arith.mulf %54, %55 : f32
%57 = arith.addf %56, %out : f32
linalg.yield %57 : f32
} -> tensor<64x64xf32>
%52 = iree_vector_ext.to_layout %51 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%53 = tensor.empty() : tensor<1x1x64x64xf32>
%inserted_slice_20 = tensor.insert_slice %52 into %53[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf32> into tensor<1x1x64x64xf32>
scf.yield %inserted_slice_20, %inserted_slice_15, %inserted_slice_18 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>
}
%extracted_slice_6 = tensor.extract_slice %28#2[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x1x64xf32> to tensor<64xf32>
%extracted_slice_7 = tensor.extract_slice %28#0[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf32> to tensor<64x64xf32>
%29 = tensor.empty() : tensor<64x64xf16>
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_6, %extracted_slice_7 : tensor<64xf32>, tensor<64x64xf32>) outs(%29 : tensor<64x64xf16>) {
^bb0(%in: f32, %in_10: f32, %out: f16):
%31 = arith.divf %cst_0, %in : f32
%32 = arith.mulf %31, %in_10 : f32
%33 = arith.truncf %32 : f32 to f16
linalg.yield %33 : f16
} -> tensor<64x64xf16>
%extracted_slice_8 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%inserted_slice_9 = tensor.insert_slice %30 into %extracted_slice_8[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %inserted_slice_9 into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
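// NOTE (reader annotation, not produced by the compiler): canonicalization folded away the
// identity (copy-only) linalg.generic ops, so the Q, K and V tiles now feed
// iree_vector_ext.to_layout directly from their tensor.extract_slice results, and the copy into
// the output tile before the parallel_insert_slice is gone as well. The tensor.empty buffers that
// only served those copies were dropped; the constant fills, the online-softmax loop structure and
// the MFMA layout annotations are unchanged.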
// -----// IR Dump After CSE (cse) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.802980e-01 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 1.000000e+00 : f32
%cst_1 = arith.constant -3.40282347E+38 : f32
%cst_2 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%15 = tensor.empty() : tensor<1x1x64x64xf32>
%16 = tensor.empty() : tensor<1x1x64xf32>
%17 = tensor.empty() : tensor<64x64xf32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst_2 : f32) outs(%17 : tensor<64x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64x64xf32>
%inserted_slice = tensor.insert_slice %18 into %15[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf32> into tensor<1x1x64x64xf32>
%19 = tensor.empty() : tensor<64xf32>
%20 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_1 : f32) outs(%19 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64xf32>
%inserted_slice_3 = tensor.insert_slice %20 into %16[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32>
%21 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_2 : f32) outs(%19 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64xf32>
%inserted_slice_4 = tensor.insert_slice %21 into %16[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%22 = iree_vector_ext.to_layout %extracted_slice_5 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16>
%23 = tensor.empty() : tensor<64x64xf16>
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst, %22 : f16, tensor<64x64xf16>) outs(%23 : tensor<64x64xf16>) {
^bb0(%in: f16, %in_10: f16, %out: f16):
%29 = arith.mulf %in, %in_10 : f16
linalg.yield %29 : f16
} -> tensor<64x64xf16>
%25 = iree_vector_ext.to_layout %24 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16>
%26 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%27:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %inserted_slice, %arg6 = %inserted_slice_3, %arg7 = %inserted_slice_4) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) {
%extracted_slice_10 = tensor.extract_slice %12[%arg0, %arg1, %arg4, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%extracted_slice_11 = tensor.extract_slice %extracted_slice_10[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%29 = iree_vector_ext.to_layout %extracted_slice_11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16>
%extracted_slice_12 = tensor.extract_slice %10[%arg0, %arg1, 0, %arg4] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x64xf16>
%extracted_slice_13 = tensor.extract_slice %extracted_slice_12[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%30 = iree_vector_ext.to_layout %extracted_slice_13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16>
%31 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16>
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%31, %25 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%26 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_20: f16, %out: f32):
%46 = arith.extf %in : f16 to f32
%47 = arith.extf %in_20 : f16 to f32
%48 = arith.mulf %46, %47 : f32
%49 = arith.addf %48, %out : f32
linalg.yield %49 : f32
} -> tensor<64x64xf32>
%33 = iree_vector_ext.to_layout %32 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%extracted_slice_14 = tensor.extract_slice %arg6[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x1x64xf32> to tensor<64xf32>
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%33 : tensor<64x64xf32>) outs(%extracted_slice_14 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
%46 = arith.maximumf %in, %out : f32
linalg.yield %46 : f32
} -> tensor<64xf32>
%inserted_slice_15 = tensor.insert_slice %34 into %arg6[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32>
%35 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%34, %extracted_slice_14 : tensor<64xf32>, tensor<64xf32>) outs(%19 : tensor<64xf32>) {
^bb0(%in: f32, %in_20: f32, %out: f32):
%46 = arith.subf %in_20, %in : f32
%47 = math.exp2 %46 : f32
linalg.yield %47 : f32
} -> tensor<64xf32>
%extracted_slice_16 = tensor.extract_slice %arg7[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x1x64xf32> to tensor<64xf32>
%36 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%35, %extracted_slice_16 : tensor<64xf32>, tensor<64xf32>) outs(%19 : tensor<64xf32>) {
^bb0(%in: f32, %in_20: f32, %out: f32):
%46 = arith.mulf %in, %in_20 : f32
linalg.yield %46 : f32
} -> tensor<64xf32>
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%34, %33 : tensor<64xf32>, tensor<64x64xf32>) outs(%17 : tensor<64x64xf32>) {
^bb0(%in: f32, %in_20: f32, %out: f32):
%46 = arith.subf %in_20, %in : f32
%47 = math.exp2 %46 : f32
linalg.yield %47 : f32
} -> tensor<64x64xf32>
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%37 : tensor<64x64xf32>) outs(%36 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
%46 = arith.addf %in, %out : f32
linalg.yield %46 : f32
} -> tensor<64xf32>
%inserted_slice_17 = tensor.insert_slice %38 into %arg7[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32>
%39 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%37 : tensor<64x64xf32>) outs(%23 : tensor<64x64xf16>) {
^bb0(%in: f32, %out: f16):
%46 = arith.truncf %in : f32 to f16
linalg.yield %46 : f16
} -> tensor<64x64xf16>
%extracted_slice_18 = tensor.extract_slice %arg5[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf32> to tensor<64x64xf32>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35, %extracted_slice_18 : tensor<64xf32>, tensor<64x64xf32>) outs(%17 : tensor<64x64xf32>) {
^bb0(%in: f32, %in_20: f32, %out: f32):
%46 = arith.mulf %in, %in_20 : f32
linalg.yield %46 : f32
} -> tensor<64x64xf32>
%41 = iree_vector_ext.to_layout %30 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16>
%42 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf16>
%43 = iree_vector_ext.to_layout %40 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%41, %42 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%43 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_20: f16, %out: f32):
%46 = arith.extf %in : f16 to f32
%47 = arith.extf %in_20 : f16 to f32
%48 = arith.mulf %46, %47 : f32
%49 = arith.addf %48, %out : f32
linalg.yield %49 : f32
} -> tensor<64x64xf32>
%45 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%inserted_slice_19 = tensor.insert_slice %45 into %15[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf32> into tensor<1x1x64x64xf32>
scf.yield %inserted_slice_19, %inserted_slice_15, %inserted_slice_17 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>
}
%extracted_slice_6 = tensor.extract_slice %27#2[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x1x64xf32> to tensor<64xf32>
%extracted_slice_7 = tensor.extract_slice %27#0[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf32> to tensor<64x64xf32>
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_6, %extracted_slice_7 : tensor<64xf32>, tensor<64x64xf32>) outs(%23 : tensor<64x64xf16>) {
^bb0(%in: f32, %in_10: f32, %out: f16):
%29 = arith.divf %cst_0, %in : f32
%30 = arith.mulf %29, %in_10 : f32
%31 = arith.truncf %30 : f32 to f16
linalg.yield %31 : f16
} -> tensor<64x64xf16>
%extracted_slice_8 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%inserted_slice_9 = tensor.insert_slice %28 into %extracted_slice_8[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %inserted_slice_9 into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
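// Reading the dump above: the attention op has been tiled and decomposed into an online-softmax
// (flash-attention style) loop. Each scf.forall iteration handles one (batch, head, 64-query) tile of
// the 2x10x4096x64 problem, and the inner scf.for walks the 4096-long key/value sequence in 64-wide
// tiles, carrying an f32 accumulator (1x1x64x64), a running row-max (1x1x64), and a running row-sum
// (1x1x64). Per iteration it computes S = Q_scaled . K^T, updates the row max, rescales the previous
// sum and accumulator by exp2(old_max - new_max), forms P = exp2(S - new_max), and accumulates P . V;
// after the loop the accumulator is divided by the row sum, truncated to f16, and stored to the
// 2x4096x10x64 output. The iree_vector_ext.to_layout ops pin the matmul operands to
// MFMA_F32_16x16x16_F16 layouts for the two contractions.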
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.802980e-01 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 1.000000e+00 : f32
%cst_1 = arith.constant -3.40282347E+38 : f32
%cst_2 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%15 = tensor.empty() : tensor<64x64xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst_2 : f32) outs(%15 : tensor<64x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64x64xf32>
%17 = tensor.empty() : tensor<64xf32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_1 : f32) outs(%17 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64xf32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_2 : f32) outs(%17 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64xf32>
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%20 = iree_vector_ext.to_layout %extracted_slice_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16>
%21 = tensor.empty() : tensor<64x64xf16>
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst, %20 : f16, tensor<64x64xf16>) outs(%21 : tensor<64x64xf16>) {
^bb0(%in: f16, %in_5: f16, %out: f16):
%27 = arith.mulf %in, %in_5 : f16
linalg.yield %27 : f16
} -> tensor<64x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16>
%24 = iree_vector_ext.to_layout %16 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%25:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %18, %arg6 = %19, %arg7 = %16) -> (tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32>) {
%extracted_slice_5 = tensor.extract_slice %12[%arg0, %arg1, %arg4, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%extracted_slice_6 = tensor.extract_slice %extracted_slice_5[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%27 = iree_vector_ext.to_layout %extracted_slice_6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16>
%extracted_slice_7 = tensor.extract_slice %10[%arg0, %arg1, 0, %arg4] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x64xf16>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_7[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%28 = iree_vector_ext.to_layout %extracted_slice_8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16>
%29 = iree_vector_ext.to_layout %27 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16>
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%29, %23 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%24 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_9: f16, %out: f32):
%44 = arith.extf %in : f16 to f32
%45 = arith.extf %in_9 : f16 to f32
%46 = arith.mulf %44, %45 : f32
%47 = arith.addf %46, %out : f32
linalg.yield %47 : f32
} -> tensor<64x64xf32>
%31 = iree_vector_ext.to_layout %30 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%31 : tensor<64x64xf32>) outs(%arg5 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
%44 = arith.maximumf %in, %out : f32
linalg.yield %44 : f32
} -> tensor<64xf32>
%33 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%32, %arg5 : tensor<64xf32>, tensor<64xf32>) outs(%17 : tensor<64xf32>) {
^bb0(%in: f32, %in_9: f32, %out: f32):
%44 = arith.subf %in_9, %in : f32
%45 = math.exp2 %44 : f32
linalg.yield %45 : f32
} -> tensor<64xf32>
%34 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%33, %arg6 : tensor<64xf32>, tensor<64xf32>) outs(%17 : tensor<64xf32>) {
^bb0(%in: f32, %in_9: f32, %out: f32):
%44 = arith.mulf %in, %in_9 : f32
linalg.yield %44 : f32
} -> tensor<64xf32>
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%32, %31 : tensor<64xf32>, tensor<64x64xf32>) outs(%15 : tensor<64x64xf32>) {
^bb0(%in: f32, %in_9: f32, %out: f32):
%44 = arith.subf %in_9, %in : f32
%45 = math.exp2 %44 : f32
linalg.yield %45 : f32
} -> tensor<64x64xf32>
%36 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%35 : tensor<64x64xf32>) outs(%34 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
%44 = arith.addf %in, %out : f32
linalg.yield %44 : f32
} -> tensor<64xf32>
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35 : tensor<64x64xf32>) outs(%21 : tensor<64x64xf16>) {
^bb0(%in: f32, %out: f16):
%44 = arith.truncf %in : f32 to f16
linalg.yield %44 : f16
} -> tensor<64x64xf16>
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%33, %arg7 : tensor<64xf32>, tensor<64x64xf32>) outs(%15 : tensor<64x64xf32>) {
^bb0(%in: f32, %in_9: f32, %out: f32):
%44 = arith.mulf %in, %in_9 : f32
linalg.yield %44 : f32
} -> tensor<64x64xf32>
%39 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16>
%40 = iree_vector_ext.to_layout %37 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf16>
%41 = iree_vector_ext.to_layout %38 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%39, %40 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%41 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_9: f16, %out: f32):
%44 = arith.extf %in : f16 to f32
%45 = arith.extf %in_9 : f16 to f32
%46 = arith.mulf %44, %45 : f32
%47 = arith.addf %46, %out : f32
linalg.yield %47 : f32
} -> tensor<64x64xf32>
%43 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
scf.yield %32, %36, %43 : tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32>
}
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%25#1, %25#2 : tensor<64xf32>, tensor<64x64xf32>) outs(%21 : tensor<64x64xf16>) {
^bb0(%in: f32, %in_5: f32, %out: f16):
%27 = arith.divf %cst_0, %in : f32
%28 = arith.mulf %27, %in_5 : f32
%29 = arith.truncf %28 : f32 to f16
linalg.yield %29 : f16
} -> tensor<64x64xf16>
%extracted_slice_4 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%inserted_slice = tensor.insert_slice %26 into %extracted_slice_4[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %inserted_slice into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
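// In this dump, OptimizeTensorInsertExtractSlicesPass has folded the unit-sized
// insert_slice/extract_slice wrappers around the loop-carried values: the scf.for now carries the
// rank-reduced running max and sum (tensor<64xf32>) and the accumulator (tensor<64x64xf32>) directly
// instead of 1x1x64[x64] tensors. The f16 scale 1.802980e-01 is approximately 0.125 * log2(e), which is
// consistent with the 1/sqrt(64) attention scale pre-folded with log2(e) so the softmax can be
// evaluated with math.exp2.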
// -----// IR Dump After DecomposeConvolutionToLowerDimOpsPass (iree-codegen-decompose-convolution-to-lower-dim-ops) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.802980e-01 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 1.000000e+00 : f32
%cst_1 = arith.constant -3.40282347E+38 : f32
%cst_2 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%15 = tensor.empty() : tensor<64x64xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst_2 : f32) outs(%15 : tensor<64x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64x64xf32>
%17 = tensor.empty() : tensor<64xf32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_1 : f32) outs(%17 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64xf32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_2 : f32) outs(%17 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64xf32>
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%20 = iree_vector_ext.to_layout %extracted_slice_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16>
%21 = tensor.empty() : tensor<64x64xf16>
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst, %20 : f16, tensor<64x64xf16>) outs(%21 : tensor<64x64xf16>) {
^bb0(%in: f16, %in_5: f16, %out: f16):
%27 = arith.mulf %in, %in_5 : f16
linalg.yield %27 : f16
} -> tensor<64x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16>
%24 = iree_vector_ext.to_layout %16 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%25:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %18, %arg6 = %19, %arg7 = %16) -> (tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32>) {
%extracted_slice_5 = tensor.extract_slice %12[%arg0, %arg1, %arg4, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%extracted_slice_6 = tensor.extract_slice %extracted_slice_5[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%27 = iree_vector_ext.to_layout %extracted_slice_6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16>
%extracted_slice_7 = tensor.extract_slice %10[%arg0, %arg1, 0, %arg4] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x64xf16>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_7[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%28 = iree_vector_ext.to_layout %extracted_slice_8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16>
%29 = iree_vector_ext.to_layout %27 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16>
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%29, %23 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%24 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_9: f16, %out: f32):
%44 = arith.extf %in : f16 to f32
%45 = arith.extf %in_9 : f16 to f32
%46 = arith.mulf %44, %45 : f32
%47 = arith.addf %46, %out : f32
linalg.yield %47 : f32
} -> tensor<64x64xf32>
%31 = iree_vector_ext.to_layout %30 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%31 : tensor<64x64xf32>) outs(%arg5 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
%44 = arith.maximumf %in, %out : f32
linalg.yield %44 : f32
} -> tensor<64xf32>
%33 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%32, %arg5 : tensor<64xf32>, tensor<64xf32>) outs(%17 : tensor<64xf32>) {
^bb0(%in: f32, %in_9: f32, %out: f32):
%44 = arith.subf %in_9, %in : f32
%45 = math.exp2 %44 : f32
linalg.yield %45 : f32
} -> tensor<64xf32>
%34 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%33, %arg6 : tensor<64xf32>, tensor<64xf32>) outs(%17 : tensor<64xf32>) {
^bb0(%in: f32, %in_9: f32, %out: f32):
%44 = arith.mulf %in, %in_9 : f32
linalg.yield %44 : f32
} -> tensor<64xf32>
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%32, %31 : tensor<64xf32>, tensor<64x64xf32>) outs(%15 : tensor<64x64xf32>) {
^bb0(%in: f32, %in_9: f32, %out: f32):
%44 = arith.subf %in_9, %in : f32
%45 = math.exp2 %44 : f32
linalg.yield %45 : f32
} -> tensor<64x64xf32>
%36 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%35 : tensor<64x64xf32>) outs(%34 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
%44 = arith.addf %in, %out : f32
linalg.yield %44 : f32
} -> tensor<64xf32>
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35 : tensor<64x64xf32>) outs(%21 : tensor<64x64xf16>) {
^bb0(%in: f32, %out: f16):
%44 = arith.truncf %in : f32 to f16
linalg.yield %44 : f16
} -> tensor<64x64xf16>
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%33, %arg7 : tensor<64xf32>, tensor<64x64xf32>) outs(%15 : tensor<64x64xf32>) {
^bb0(%in: f32, %in_9: f32, %out: f32):
%44 = arith.mulf %in, %in_9 : f32
linalg.yield %44 : f32
} -> tensor<64x64xf32>
%39 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16>
%40 = iree_vector_ext.to_layout %37 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf16>
%41 = iree_vector_ext.to_layout %38 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%39, %40 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%41 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_9: f16, %out: f32):
%44 = arith.extf %in : f16 to f32
%45 = arith.extf %in_9 : f16 to f32
%46 = arith.mulf %44, %45 : f32
%47 = arith.addf %46, %out : f32
linalg.yield %47 : f32
} -> tensor<64x64xf32>
%43 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
scf.yield %32, %36, %43 : tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32>
}
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%25#1, %25#2 : tensor<64xf32>, tensor<64x64xf32>) outs(%21 : tensor<64x64xf16>) {
^bb0(%in: f32, %in_5: f32, %out: f16):
%27 = arith.divf %cst_0, %in : f32
%28 = arith.mulf %27, %in_5 : f32
%29 = arith.truncf %28 : f32 to f16
linalg.yield %29 : f16
} -> tensor<64x64xf16>
%extracted_slice_4 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%inserted_slice = tensor.insert_slice %26 into %extracted_slice_4[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %inserted_slice into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
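// DecomposeConvolutionToLowerDimOpsPass leaves the IR unchanged: this attention dispatch contains no
// convolution ops for it to decompose, so the dump above is identical to the previous one.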
// -----// IR Dump After DecomposeIm2colPass (iree-linalg-ext-decompose-im2col) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 1.802980e-01 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 1.000000e+00 : f32
%cst_1 = arith.constant -3.40282347E+38 : f32
%cst_2 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%15 = tensor.empty() : tensor<64x64xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst_2 : f32) outs(%15 : tensor<64x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64x64xf32>
%17 = tensor.empty() : tensor<64xf32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_1 : f32) outs(%17 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64xf32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_2 : f32) outs(%17 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64xf32>
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%20 = iree_vector_ext.to_layout %extracted_slice_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16>
%21 = tensor.empty() : tensor<64x64xf16>
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst, %20 : f16, tensor<64x64xf16>) outs(%21 : tensor<64x64xf16>) {
^bb0(%in: f16, %in_5: f16, %out: f16):
%27 = arith.mulf %in, %in_5 : f16
linalg.yield %27 : f16
} -> tensor<64x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16>
%24 = iree_vector_ext.to_layout %16 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%25:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %18, %arg6 = %19, %arg7 = %16) -> (tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32>) {
%extracted_slice_5 = tensor.extract_slice %12[%arg0, %arg1, %arg4, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%extracted_slice_6 = tensor.extract_slice %extracted_slice_5[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%27 = iree_vector_ext.to_layout %extracted_slice_6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16>
%extracted_slice_7 = tensor.extract_slice %10[%arg0, %arg1, 0, %arg4] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x64xf16>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_7[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%28 = iree_vector_ext.to_layout %extracted_slice_8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16>
%29 = iree_vector_ext.to_layout %27 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16>
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%29, %23 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%24 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_9: f16, %out: f32):
%44 = arith.extf %in : f16 to f32
%45 = arith.extf %in_9 : f16 to f32
%46 = arith.mulf %44, %45 : f32
%47 = arith.addf %46, %out : f32
linalg.yield %47 : f32
} -> tensor<64x64xf32>
%31 = iree_vector_ext.to_layout %30 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%31 : tensor<64x64xf32>) outs(%arg5 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
%44 = arith.maximumf %in, %out : f32
linalg.yield %44 : f32
} -> tensor<64xf32>
%33 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%32, %arg5 : tensor<64xf32>, tensor<64xf32>) outs(%17 : tensor<64xf32>) {
^bb0(%in: f32, %in_9: f32, %out: f32):
%44 = arith.subf %in_9, %in : f32
%45 = math.exp2 %44 : f32
linalg.yield %45 : f32
} -> tensor<64xf32>
%34 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%33, %arg6 : tensor<64xf32>, tensor<64xf32>) outs(%17 : tensor<64xf32>) {
^bb0(%in: f32, %in_9: f32, %out: f32):
%44 = arith.mulf %in, %in_9 : f32
linalg.yield %44 : f32
} -> tensor<64xf32>
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%32, %31 : tensor<64xf32>, tensor<64x64xf32>) outs(%15 : tensor<64x64xf32>) {
^bb0(%in: f32, %in_9: f32, %out: f32):
%44 = arith.subf %in_9, %in : f32
%45 = math.exp2 %44 : f32
linalg.yield %45 : f32
} -> tensor<64x64xf32>
%36 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%35 : tensor<64x64xf32>) outs(%34 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
%44 = arith.addf %in, %out : f32
linalg.yield %44 : f32
} -> tensor<64xf32>
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35 : tensor<64x64xf32>) outs(%21 : tensor<64x64xf16>) {
^bb0(%in: f32, %out: f16):
%44 = arith.truncf %in : f32 to f16
linalg.yield %44 : f16
} -> tensor<64x64xf16>
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%33, %arg7 : tensor<64xf32>, tensor<64x64xf32>) outs(%15 : tensor<64x64xf32>) {
^bb0(%in: f32, %in_9: f32, %out: f32):
%44 = arith.mulf %in, %in_9 : f32
linalg.yield %44 : f32
} -> tensor<64x64xf32>
%39 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16>
%40 = iree_vector_ext.to_layout %37 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf16>
%41 = iree_vector_ext.to_layout %38 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%39, %40 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%41 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_9: f16, %out: f32):
%44 = arith.extf %in : f16 to f32
%45 = arith.extf %in_9 : f16 to f32
%46 = arith.mulf %44, %45 : f32
%47 = arith.addf %46, %out : f32
linalg.yield %47 : f32
} -> tensor<64x64xf32>
%43 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
scf.yield %32, %36, %43 : tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32>
}
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%25#1, %25#2 : tensor<64xf32>, tensor<64x64xf32>) outs(%21 : tensor<64x64xf16>) {
^bb0(%in: f32, %in_5: f32, %out: f16):
%27 = arith.divf %cst_0, %in : f32
%28 = arith.mulf %27, %in_5 : f32
%29 = arith.truncf %28 : f32 to f16
linalg.yield %29 : f16
} -> tensor<64x64xf16>
%extracted_slice_4 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%inserted_slice = tensor.insert_slice %26 into %extracted_slice_4[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %inserted_slice into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
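// DecomposeIm2colPass is likewise a no-op here: there are no iree_linalg_ext.im2col ops in this
// dispatch, so the dump above matches the previous one.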
// -----// IR Dump After VectorizeIREEVectorExtOpsPass (iree-vector-ext-vectorize-ops) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 1.802980e-01 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%cst_1 = arith.constant 1.000000e+00 : f32
%cst_2 = arith.constant -3.40282347E+38 : f32
%cst_3 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%15 = tensor.empty() : tensor<64x64xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst_3 : f32) outs(%15 : tensor<64x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64x64xf32>
%17 = tensor.empty() : tensor<64xf32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_2 : f32) outs(%17 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64xf32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_3 : f32) outs(%17 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64xf32>
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%20 = vector.transfer_read %extracted_slice_4[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16>
%21 = iree_vector_ext.to_layout %20 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%22 = tensor.empty() : tensor<64x64xf16>
%23 = vector.transfer_write %21, %22[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16>
%24 = tensor.empty() : tensor<64x64xf16>
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst_0, %23 : f16, tensor<64x64xf16>) outs(%24 : tensor<64x64xf16>) {
^bb0(%in: f16, %in_6: f16, %out: f16):
%36 = arith.mulf %in, %in_6 : f16
linalg.yield %36 : f16
} -> tensor<64x64xf16>
%26 = vector.transfer_read %25[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16>
%27 = iree_vector_ext.to_layout %26 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16>
%28 = tensor.empty() : tensor<64x64xf16>
%29 = vector.transfer_write %27, %28[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16>
%30 = vector.transfer_read %16[%c0, %c0], %cst_3 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32>
%31 = iree_vector_ext.to_layout %30 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%32 = tensor.empty() : tensor<64x64xf32>
%33 = vector.transfer_write %31, %32[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
%34:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %18, %arg6 = %19, %arg7 = %16) -> (tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32>) {
%extracted_slice_6 = tensor.extract_slice %12[%arg0, %arg1, %arg4, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%extracted_slice_7 = tensor.extract_slice %extracted_slice_6[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%36 = vector.transfer_read %extracted_slice_7[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16>
%37 = iree_vector_ext.to_layout %36 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%extracted_slice_8 = tensor.extract_slice %10[%arg0, %arg1, 0, %arg4] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x64xf16>
%extracted_slice_9 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%38 = vector.transfer_read %extracted_slice_9[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16>
%39 = iree_vector_ext.to_layout %38 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%40 = iree_vector_ext.to_layout %37 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16>
%41 = tensor.empty() : tensor<64x64xf16>
%42 = vector.transfer_write %40, %41[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16>
%43 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%42, %29 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%33 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_10: f16, %out: f32):
%71 = arith.extf %in : f16 to f32
%72 = arith.extf %in_10 : f16 to f32
%73 = arith.mulf %71, %72 : f32
%74 = arith.addf %73, %out : f32
linalg.yield %74 : f32
} -> tensor<64x64xf32>
%44 = vector.transfer_read %43[%c0, %c0], %cst_3 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32>
%45 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%46 = tensor.empty() : tensor<64x64xf32>
%47 = vector.transfer_write %45, %46[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
%48 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%47 : tensor<64x64xf32>) outs(%arg5 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
%71 = arith.maximumf %in, %out : f32
linalg.yield %71 : f32
} -> tensor<64xf32>
%49 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%48, %arg5 : tensor<64xf32>, tensor<64xf32>) outs(%17 : tensor<64xf32>) {
^bb0(%in: f32, %in_10: f32, %out: f32):
%71 = arith.subf %in_10, %in : f32
%72 = math.exp2 %71 : f32
linalg.yield %72 : f32
} -> tensor<64xf32>
%50 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%49, %arg6 : tensor<64xf32>, tensor<64xf32>) outs(%17 : tensor<64xf32>) {
^bb0(%in: f32, %in_10: f32, %out: f32):
%71 = arith.mulf %in, %in_10 : f32
linalg.yield %71 : f32
} -> tensor<64xf32>
%51 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%48, %47 : tensor<64xf32>, tensor<64x64xf32>) outs(%15 : tensor<64x64xf32>) {
^bb0(%in: f32, %in_10: f32, %out: f32):
%71 = arith.subf %in_10, %in : f32
%72 = math.exp2 %71 : f32
linalg.yield %72 : f32
} -> tensor<64x64xf32>
%52 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%51 : tensor<64x64xf32>) outs(%50 : tensor<64xf32>) {
^bb0(%in: f32, %out: f32):
%71 = arith.addf %in, %out : f32
linalg.yield %71 : f32
} -> tensor<64xf32>
%53 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%51 : tensor<64x64xf32>) outs(%24 : tensor<64x64xf16>) {
^bb0(%in: f32, %out: f16):
%71 = arith.truncf %in : f32 to f16
linalg.yield %71 : f16
} -> tensor<64x64xf16>
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%49, %arg7 : tensor<64xf32>, tensor<64x64xf32>) outs(%15 : tensor<64x64xf32>) {
^bb0(%in: f32, %in_10: f32, %out: f32):
%71 = arith.mulf %in, %in_10 : f32
linalg.yield %71 : f32
} -> tensor<64x64xf32>
%55 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16>
%56 = tensor.empty() : tensor<64x64xf16>
%57 = vector.transfer_write %55, %56[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16>
%58 = vector.transfer_read %53[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16>
%59 = iree_vector_ext.to_layout %58 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%60 = tensor.empty() : tensor<64x64xf16>
%61 = vector.transfer_write %59, %60[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16>
%62 = vector.transfer_read %54[%c0, %c0], %cst_3 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32>
%63 = iree_vector_ext.to_layout %62 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%64 = tensor.empty() : tensor<64x64xf32>
%65 = vector.transfer_write %63, %64[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
%66 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%57, %61 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%65 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_10: f16, %out: f32):
%71 = arith.extf %in : f16 to f32
%72 = arith.extf %in_10 : f16 to f32
%73 = arith.mulf %71, %72 : f32
%74 = arith.addf %73, %out : f32
linalg.yield %74 : f32
} -> tensor<64x64xf32>
%67 = vector.transfer_read %66[%c0, %c0], %cst_3 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32>
%68 = iree_vector_ext.to_layout %67 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%69 = tensor.empty() : tensor<64x64xf32>
%70 = vector.transfer_write %68, %69[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
scf.yield %48, %52, %70 : tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32>
}
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%34#1, %34#2 : tensor<64xf32>, tensor<64x64xf32>) outs(%24 : tensor<64x64xf16>) {
^bb0(%in: f32, %in_6: f32, %out: f16):
%36 = arith.divf %cst_1, %in : f32
%37 = arith.mulf %36, %in_6 : f32
%38 = arith.truncf %37 : f32 to f16
linalg.yield %38 : f16
} -> tensor<64x64xf16>
%extracted_slice_5 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%inserted_slice = tensor.insert_slice %35 into %extracted_slice_5[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %inserted_slice into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
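// annotation (added for readability; not part of the compiler dump): after
// VectorizeIREEVectorExtOpsPass the iree_vector_ext.to_layout ops act on
// vector<64x64xf16>/vector<64x64xf32> values produced by vector.transfer_read, each
// carrying a #iree_vector_ext.nested_layout (subgroup/batch/outer/thread/element
// tiles) and, for the contraction operands, mma_kind = MFMA_F32_16x16x16_F16 plus a
// shared_memory_conversion marker. The surrounding linalg.generic regions (QK^T
// contraction, row max/sum, P·V contraction) are still in tensor form here; they are
// vectorized by the next pass.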
// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_4 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%cst_5 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%15 = tensor.empty() : tensor<64x64xf32>
%16 = vector.transfer_write %cst_3, %15[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
%17 = tensor.empty() : tensor<64xf32>
%18 = vector.transfer_write %cst_2, %17[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32>
%19 = vector.transfer_write %cst_1, %17[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32>
%extracted_slice_6 = tensor.extract_slice %extracted_slice[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%20 = vector.transfer_read %extracted_slice_6[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16>
%21 = iree_vector_ext.to_layout %20 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%22 = tensor.empty() : tensor<64x64xf16>
%23 = arith.mulf %21, %cst_0 : vector<64x64xf16>
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16>
%25 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%26:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %18, %arg6 = %19, %arg7 = %16) -> (tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32>) {
%extracted_slice_8 = tensor.extract_slice %12[%arg0, %arg1, %arg4, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%extracted_slice_9 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%35 = vector.transfer_read %extracted_slice_9[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16>
%36 = iree_vector_ext.to_layout %35 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%extracted_slice_10 = tensor.extract_slice %10[%arg0, %arg1, 0, %arg4] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x64xf16>
%extracted_slice_11 = tensor.extract_slice %extracted_slice_10[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%37 = vector.transfer_read %extracted_slice_11[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16>
%38 = iree_vector_ext.to_layout %37 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%39 = iree_vector_ext.to_layout %36 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16>
%40 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %39, %24, %25 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%41 = iree_vector_ext.to_layout %40 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%42 = vector.transfer_read %arg5[%c0], %cst_5 {in_bounds = [true]} : tensor<64xf32>, vector<64xf32>
%43 = vector.multi_reduction <maximumf>, %41, %42 [1] : vector<64x64xf32> to vector<64xf32>
%44 = vector.transfer_write %43, %arg5[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32>
%45 = vector.transfer_read %arg5[%c0], %cst_5 {in_bounds = [true]} : tensor<64xf32>, vector<64xf32>
%46 = arith.subf %45, %43 : vector<64xf32>
%47 = math.exp2 %46 : vector<64xf32>
%48 = vector.transfer_read %arg6[%c0], %cst_5 {in_bounds = [true]} : tensor<64xf32>, vector<64xf32>
%49 = arith.mulf %47, %48 : vector<64xf32>
%50 = vector.transfer_write %49, %17[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32>
%51 = vector.broadcast %43 : vector<64xf32> to vector<64x64xf32>
%52 = vector.transpose %51, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%53 = arith.subf %41, %52 : vector<64x64xf32>
%54 = math.exp2 %53 : vector<64x64xf32>
%55 = vector.multi_reduction <add>, %54, %49 [1] : vector<64x64xf32> to vector<64xf32>
%56 = vector.transfer_write %55, %50[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32>
%57 = arith.truncf %54 : vector<64x64xf32> to vector<64x64xf16>
%58 = vector.broadcast %47 : vector<64xf32> to vector<64x64xf32>
%59 = vector.transpose %58, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%60 = vector.transfer_read %arg7[%c0, %c0], %cst_5 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32>
%61 = arith.mulf %59, %60 : vector<64x64xf32>
%62 = iree_vector_ext.to_layout %38 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16>
%63 = iree_vector_ext.to_layout %57 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%64 = iree_vector_ext.to_layout %61 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%65 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %62, %63, %64 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%66 = iree_vector_ext.to_layout %65 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%67 = tensor.empty() : tensor<64x64xf32>
%68 = vector.transfer_write %66, %67[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
scf.yield %44, %56, %68 : tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32>
}
%27 = vector.transfer_read %26#1[%c0], %cst_5 {in_bounds = [true]} : tensor<64xf32>, vector<64xf32>
%28 = vector.broadcast %27 : vector<64xf32> to vector<64x64xf32>
%29 = vector.transfer_read %26#2[%c0, %c0], %cst_5 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32>
%30 = arith.divf %cst, %28 : vector<64x64xf32>
%31 = vector.transpose %30, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%32 = arith.mulf %31, %29 : vector<64x64xf32>
%33 = arith.truncf %32 : vector<64x64xf32> to vector<64x64xf16>
%34 = vector.transfer_write %33, %22[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16>
%extracted_slice_7 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%inserted_slice = tensor.insert_slice %34 into %extracted_slice_7[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %inserted_slice into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
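// annotation (added for readability; not part of the compiler dump):
// GenericVectorizationPass replaced the remaining linalg.generic ops with vector ops:
// the two contractions became vector.contract, the row max/sum reductions became
// vector.multi_reduction <maximumf>/<add>, and the elementwise math stays as
// arith/math ops on vectors. The per-k-block online-softmax update is now explicit
// in the loop body; as a sketch (notation only, not IR):
//   m_new = max(m_old, rowmax(S))                 // vector.multi_reduction <maximumf>
//   alpha = exp2(m_old - m_new)                   // arith.subf + math.exp2
//   l_new = rowsum(exp2(S - m_new)) + alpha*l_old // vector.multi_reduction <add>
//   acc   = alpha*acc, then acc += P·V            // second vector.contract
// The exp2 form is consistent with the f16 scale constant 1.802980e-01, which is
// 0.125 * log2(e) rounded to f16 (0.125 = 1/sqrt(64), the attention scale).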
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_4 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%cst_5 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%15 = tensor.empty() : tensor<64x64xf32>
%16 = vector.transfer_write %cst_3, %15[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
%17 = tensor.empty() : tensor<64xf32>
%18 = vector.transfer_write %cst_2, %17[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32>
%19 = vector.transfer_write %cst_1, %17[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32>
%extracted_slice_6 = tensor.extract_slice %extracted_slice[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%20 = vector.transfer_read %extracted_slice_6[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16>
%21 = iree_vector_ext.to_layout %20 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%22 = tensor.empty() : tensor<64x64xf16>
%23 = arith.mulf %21, %cst_0 : vector<64x64xf16>
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16>
%25 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%26:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %18, %arg6 = %19, %arg7 = %16) -> (tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32>) {
%extracted_slice_8 = tensor.extract_slice %12[%arg0, %arg1, %arg4, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%extracted_slice_9 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%35 = vector.transfer_read %extracted_slice_9[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16>
%36 = iree_vector_ext.to_layout %35 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%extracted_slice_10 = tensor.extract_slice %10[%arg0, %arg1, 0, %arg4] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x64xf16>
%extracted_slice_11 = tensor.extract_slice %extracted_slice_10[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%37 = vector.transfer_read %extracted_slice_11[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16>
%38 = iree_vector_ext.to_layout %37 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%39 = iree_vector_ext.to_layout %36 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16>
%40 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %39, %24, %25 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%41 = iree_vector_ext.to_layout %40 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%42 = vector.transfer_read %arg5[%c0], %cst_5 {in_bounds = [true]} : tensor<64xf32>, vector<64xf32>
%43 = vector.multi_reduction <maximumf>, %41, %42 [1] : vector<64x64xf32> to vector<64xf32>
%44 = vector.transfer_write %43, %arg5[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32>
%45 = vector.transfer_read %arg5[%c0], %cst_5 {in_bounds = [true]} : tensor<64xf32>, vector<64xf32>
%46 = arith.subf %45, %43 : vector<64xf32>
%47 = math.exp2 %46 : vector<64xf32>
%48 = vector.transfer_read %arg6[%c0], %cst_5 {in_bounds = [true]} : tensor<64xf32>, vector<64xf32>
%49 = arith.mulf %47, %48 : vector<64xf32>
%50 = vector.broadcast %43 : vector<64xf32> to vector<64x64xf32>
%51 = vector.transpose %50, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%52 = arith.subf %41, %51 : vector<64x64xf32>
%53 = math.exp2 %52 : vector<64x64xf32>
%54 = vector.multi_reduction <add>, %53, %49 [1] : vector<64x64xf32> to vector<64xf32>
%55 = vector.transfer_write %54, %17[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32>
%56 = arith.truncf %53 : vector<64x64xf32> to vector<64x64xf16>
%57 = vector.broadcast %47 : vector<64xf32> to vector<64x64xf32>
%58 = vector.transpose %57, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%59 = vector.transfer_read %arg7[%c0, %c0], %cst_5 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32>
%60 = arith.mulf %58, %59 : vector<64x64xf32>
%61 = iree_vector_ext.to_layout %38 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16>
%62 = iree_vector_ext.to_layout %56 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%63 = iree_vector_ext.to_layout %60 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%64 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %61, %62, %63 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%65 = iree_vector_ext.to_layout %64 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%66 = tensor.empty() : tensor<64x64xf32>
%67 = vector.transfer_write %65, %66[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
scf.yield %44, %55, %67 : tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32>
}
%27 = vector.transfer_read %26#1[%c0], %cst_5 {in_bounds = [true]} : tensor<64xf32>, vector<64xf32>
%28 = vector.broadcast %27 : vector<64xf32> to vector<64x64xf32>
%29 = vector.transfer_read %26#2[%c0, %c0], %cst_5 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32>
%30 = arith.divf %cst, %28 : vector<64x64xf32>
%31 = vector.transpose %30, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%32 = arith.mulf %31, %29 : vector<64x64xf32>
%33 = arith.truncf %32 : vector<64x64xf32> to vector<64x64xf16>
%34 = vector.transfer_write %33, %22[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16>
%extracted_slice_7 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%inserted_slice = tensor.insert_slice %34 into %extracted_slice_7[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %inserted_slice into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
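// annotation (added for readability; not part of the compiler dump): canonicalization
// mainly cleans up the loop body: the dead intermediate vector.transfer_write of the
// rescaled row sum (whose destination tensor was immediately overwritten) is removed,
// and the new row sum is written straight into the tensor.empty %17, so the loop now
// yields one transfer_write per iter_arg instead of a chain of writes.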
// -----// IR Dump After CSE (cse) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_4 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%cst_5 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%extracted_slice = tensor.extract_slice %13[%arg0, %arg1, %arg2, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%15 = tensor.empty() : tensor<64x64xf32>
%16 = vector.transfer_write %cst_3, %15[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
%17 = tensor.empty() : tensor<64xf32>
%18 = vector.transfer_write %cst_2, %17[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32>
%19 = vector.transfer_write %cst_1, %17[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32>
%extracted_slice_6 = tensor.extract_slice %extracted_slice[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%20 = vector.transfer_read %extracted_slice_6[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16>
%21 = iree_vector_ext.to_layout %20 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%22 = tensor.empty() : tensor<64x64xf16>
%23 = arith.mulf %21, %cst_0 : vector<64x64xf16>
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16>
%25 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%26:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %18, %arg6 = %19, %arg7 = %16) -> (tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32>) {
%extracted_slice_8 = tensor.extract_slice %12[%arg0, %arg1, %arg4, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x4096x64xf16> to tensor<1x1x64x64xf16>
%extracted_slice_9 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%35 = vector.transfer_read %extracted_slice_9[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16>
%36 = iree_vector_ext.to_layout %35 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%extracted_slice_10 = tensor.extract_slice %10[%arg0, %arg1, 0, %arg4] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x10x64x4096xf16> to tensor<1x1x64x64xf16>
%extracted_slice_11 = tensor.extract_slice %extracted_slice_10[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16>
%37 = vector.transfer_read %extracted_slice_11[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16>
%38 = iree_vector_ext.to_layout %37 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%39 = iree_vector_ext.to_layout %36 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16>
%40 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %39, %24, %25 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%41 = iree_vector_ext.to_layout %40 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%42 = vector.transfer_read %arg5[%c0], %cst_5 {in_bounds = [true]} : tensor<64xf32>, vector<64xf32>
%43 = vector.multi_reduction <maximumf>, %41, %42 [1] : vector<64x64xf32> to vector<64xf32>
%44 = vector.transfer_write %43, %arg5[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32>
%45 = arith.subf %42, %43 : vector<64xf32>
%46 = math.exp2 %45 : vector<64xf32>
%47 = vector.transfer_read %arg6[%c0], %cst_5 {in_bounds = [true]} : tensor<64xf32>, vector<64xf32>
%48 = arith.mulf %46, %47 : vector<64xf32>
%49 = vector.broadcast %43 : vector<64xf32> to vector<64x64xf32>
%50 = vector.transpose %49, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%51 = arith.subf %41, %50 : vector<64x64xf32>
%52 = math.exp2 %51 : vector<64x64xf32>
%53 = vector.multi_reduction <add>, %52, %48 [1] : vector<64x64xf32> to vector<64xf32>
%54 = vector.transfer_write %53, %17[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32>
%55 = arith.truncf %52 : vector<64x64xf32> to vector<64x64xf16>
%56 = vector.broadcast %46 : vector<64xf32> to vector<64x64xf32>
%57 = vector.transpose %56, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%58 = vector.transfer_read %arg7[%c0, %c0], %cst_5 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32>
%59 = arith.mulf %57, %58 : vector<64x64xf32>
%60 = iree_vector_ext.to_layout %38 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16>
%61 = iree_vector_ext.to_layout %55 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%62 = iree_vector_ext.to_layout %59 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%63 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %61, %62 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%64 = iree_vector_ext.to_layout %63 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%65 = vector.transfer_write %64, %15[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
scf.yield %44, %54, %65 : tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32>
}
%27 = vector.transfer_read %26#1[%c0], %cst_5 {in_bounds = [true]} : tensor<64xf32>, vector<64xf32>
%28 = vector.broadcast %27 : vector<64xf32> to vector<64x64xf32>
%29 = vector.transfer_read %26#2[%c0, %c0], %cst_5 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32>
%30 = arith.divf %cst, %28 : vector<64x64xf32>
%31 = vector.transpose %30, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%32 = arith.mulf %31, %29 : vector<64x64xf32>
%33 = arith.truncf %32 : vector<64x64xf32> to vector<64x64xf16>
%34 = vector.transfer_write %33, %22[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16>
%extracted_slice_7 = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%inserted_slice = tensor.insert_slice %34 into %extracted_slice_7[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %inserted_slice into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_4 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%15 = vector.transfer_read %13[%arg0, %arg1, %arg2, %c0], %cst_4 {in_bounds = [true, true]} : tensor<2x10x4096x64xf16>, vector<64x64xf16>
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%17 = tensor.empty() : tensor<64x64xf16>
%18 = arith.mulf %16, %cst_0 : vector<64x64xf16>
%19 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16>
%20 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%21:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %cst_2, %arg6 = %cst_1, %arg7 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
%28 = vector.transfer_read %12[%arg0, %arg1, %arg4, %c0], %cst_4 {in_bounds = [true, true]} : tensor<2x10x4096x64xf16>, vector<64x64xf16>
%29 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%30 = vector.transfer_read %10[%arg0, %arg1, %c0, %arg4], %cst_4 {in_bounds = [true, true]} : tensor<2x10x64x4096xf16>, vector<64x64xf16>
%31 = iree_vector_ext.to_layout %30 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%32 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16>
%33 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %32, %19, %20 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%34 = iree_vector_ext.to_layout %33 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%35 = vector.multi_reduction <maximumf>, %34, %arg5 [1] : vector<64x64xf32> to vector<64xf32>
%36 = arith.subf %arg5, %35 : vector<64xf32>
%37 = math.exp2 %36 : vector<64xf32>
%38 = arith.mulf %37, %arg6 : vector<64xf32>
%39 = vector.broadcast %35 : vector<64xf32> to vector<64x64xf32>
%40 = vector.transpose %39, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%41 = arith.subf %34, %40 : vector<64x64xf32>
%42 = math.exp2 %41 : vector<64x64xf32>
%43 = vector.multi_reduction <add>, %42, %38 [1] : vector<64x64xf32> to vector<64xf32>
%44 = arith.truncf %42 : vector<64x64xf32> to vector<64x64xf16>
%45 = vector.broadcast %37 : vector<64xf32> to vector<64x64xf32>
%46 = vector.transpose %45, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%47 = arith.mulf %46, %arg7 : vector<64x64xf32>
%48 = iree_vector_ext.to_layout %31 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16>
%49 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%50 = iree_vector_ext.to_layout %47 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%51 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %49, %50 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%52 = iree_vector_ext.to_layout %51 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %35, %43, %52 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
}
%22 = vector.broadcast %21#1 : vector<64xf32> to vector<64x64xf32>
%23 = arith.divf %cst, %22 : vector<64x64xf32>
%24 = vector.transpose %23, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%25 = arith.mulf %24, %21#2 : vector<64x64xf32>
%26 = arith.truncf %25 : vector<64x64xf32> to vector<64x64xf16>
%27 = vector.transfer_write %26, %17[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16>
%extracted_slice = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%inserted_slice = tensor.insert_slice %27 into %extracted_slice[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %inserted_slice into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
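// NOTE: the scf.for loop in the dump above is an online-softmax (flash-attention style)
// accumulation over 64-wide K/V tiles: a running row max (multi_reduction <maximumf>),
// a running row sum (multi_reduction <add>), and a rescaled accumulator fed into the
// second vector.contract, with a final 1/sum scaling after the loop. The dense<1.802980e-01>
// factor appears to be the 1/sqrt(64) query scale with log2(e) folded in so math.exp2 can be
// used instead of exp. A minimal NumPy sketch of that recurrence follows; the array names,
// shapes, and float64 arithmetic are illustrative assumptions, not part of the dump.
import numpy as np

def attention_tile(q, k, v, scale=0.125):
    # q: (64, 64) query tile; k, v: (4096, 64) full K/V for one (batch, head) slice.
    m = np.full(64, -np.finfo(np.float32).max)      # running row max (iter_arg %arg5)
    l = np.zeros(64)                                # running row sum (iter_arg %arg6)
    acc = np.zeros((64, 64))                        # running output accumulator (%arg7)
    qs = q * (scale * np.log2(np.e))                # fold ln->log2 conversion into the scale
    for j in range(0, k.shape[0], 64):
        s = qs @ k[j:j + 64].T                      # first vector.contract: Q @ K^T score tile
        m_new = np.maximum(m, s.max(axis=1))        # multi_reduction <maximumf> over kv dim
        alpha = np.exp2(m - m_new)                  # rescale factor for the old state
        p = np.exp2(s - m_new[:, None])             # unnormalized probabilities
        l = alpha * l + p.sum(axis=1)               # multi_reduction <add> with rescaled init
        acc = alpha[:, None] * acc + p @ v[j:j + 64]  # second vector.contract: P @ V
        m = m_new
    return acc / l[:, None]                         # epilogue: divf by running sum, then truncf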
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_4 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%15 = vector.transfer_read %13[%arg0, %arg1, %arg2, %c0], %cst_4 {in_bounds = [true, true]} : tensor<2x10x4096x64xf16>, vector<64x64xf16>
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%17 = tensor.empty() : tensor<64x64xf16>
%18 = arith.mulf %16, %cst_0 : vector<64x64xf16>
%19 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16>
%20 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%21:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %cst_2, %arg6 = %cst_1, %arg7 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
%28 = vector.transfer_read %12[%arg0, %arg1, %arg4, %c0], %cst_4 {in_bounds = [true, true]} : tensor<2x10x4096x64xf16>, vector<64x64xf16>
%29 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%30 = vector.transfer_read %10[%arg0, %arg1, %c0, %arg4], %cst_4 {in_bounds = [true, true]} : tensor<2x10x64x4096xf16>, vector<64x64xf16>
%31 = iree_vector_ext.to_layout %30 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%32 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16>
%33 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %32, %19, %20 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%34 = iree_vector_ext.to_layout %33 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%35 = vector.multi_reduction <maximumf>, %34, %arg5 [1] : vector<64x64xf32> to vector<64xf32>
%36 = arith.subf %arg5, %35 : vector<64xf32>
%37 = math.exp2 %36 : vector<64xf32>
%38 = arith.mulf %37, %arg6 : vector<64xf32>
%39 = vector.broadcast %35 : vector<64xf32> to vector<64x64xf32>
%40 = vector.transpose %39, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%41 = arith.subf %34, %40 : vector<64x64xf32>
%42 = math.exp2 %41 : vector<64x64xf32>
%43 = vector.multi_reduction <add>, %42, %38 [1] : vector<64x64xf32> to vector<64xf32>
%44 = arith.truncf %42 : vector<64x64xf32> to vector<64x64xf16>
%45 = vector.broadcast %37 : vector<64xf32> to vector<64x64xf32>
%46 = vector.transpose %45, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%47 = arith.mulf %46, %arg7 : vector<64x64xf32>
%48 = iree_vector_ext.to_layout %31 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16>
%49 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%50 = iree_vector_ext.to_layout %47 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%51 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %49, %50 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%52 = iree_vector_ext.to_layout %51 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %35, %43, %52 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
}
%22 = vector.broadcast %21#1 : vector<64xf32> to vector<64x64xf32>
%23 = arith.divf %cst, %22 : vector<64x64xf32>
%24 = vector.transpose %23, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%25 = arith.mulf %24, %21#2 : vector<64x64xf32>
%26 = arith.truncf %25 : vector<64x64xf32> to vector<64x64xf16>
%27 = vector.transfer_write %26, %17[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16>
%extracted_slice = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%inserted_slice = tensor.insert_slice %27 into %extracted_slice[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %inserted_slice into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_4 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
%15 = vector.transfer_read %13[%arg0, %arg1, %arg2, %c0], %cst_4 {in_bounds = [true, true]} : tensor<2x10x4096x64xf16>, vector<64x64xf16>
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%17 = tensor.empty() : tensor<64x64xf16>
%18 = arith.mulf %16, %cst_0 : vector<64x64xf16>
%19 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16>
%20 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%21:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %cst_2, %arg6 = %cst_1, %arg7 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
%28 = vector.transfer_read %12[%arg0, %arg1, %arg4, %c0], %cst_4 {in_bounds = [true, true]} : tensor<2x10x4096x64xf16>, vector<64x64xf16>
%29 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%30 = vector.transfer_read %10[%arg0, %arg1, %c0, %arg4], %cst_4 {in_bounds = [true, true]} : tensor<2x10x64x4096xf16>, vector<64x64xf16>
%31 = iree_vector_ext.to_layout %30 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%32 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16>
%33 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %32, %19, %20 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%34 = iree_vector_ext.to_layout %33 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%35 = vector.multi_reduction <maximumf>, %34, %arg5 [1] : vector<64x64xf32> to vector<64xf32>
%36 = arith.subf %arg5, %35 : vector<64xf32>
%37 = math.exp2 %36 : vector<64xf32>
%38 = arith.mulf %37, %arg6 : vector<64xf32>
%39 = vector.broadcast %35 : vector<64xf32> to vector<64x64xf32>
%40 = vector.transpose %39, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%41 = arith.subf %34, %40 : vector<64x64xf32>
%42 = math.exp2 %41 : vector<64x64xf32>
%43 = vector.multi_reduction <add>, %42, %38 [1] : vector<64x64xf32> to vector<64xf32>
%44 = arith.truncf %42 : vector<64x64xf32> to vector<64x64xf16>
%45 = vector.broadcast %37 : vector<64xf32> to vector<64x64xf32>
%46 = vector.transpose %45, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%47 = arith.mulf %46, %arg7 : vector<64x64xf32>
%48 = iree_vector_ext.to_layout %31 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16>
%49 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%50 = iree_vector_ext.to_layout %47 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%51 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %49, %50 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%52 = iree_vector_ext.to_layout %51 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %35, %43, %52 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
}
%22 = vector.broadcast %21#1 : vector<64xf32> to vector<64x64xf32>
%23 = arith.divf %cst, %22 : vector<64x64xf32>
%24 = vector.transpose %23, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%25 = arith.mulf %24, %21#2 : vector<64x64xf32>
%26 = arith.truncf %25 : vector<64x64xf32> to vector<64x64xf16>
%27 = vector.transfer_write %26, %17[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16>
%extracted_slice = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%inserted_slice = tensor.insert_slice %27 into %extracted_slice[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %inserted_slice into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After GPUVectorAllocPass (iree-codegen-gpu-vector-alloc) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_4 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
gpu.barrier
%15 = vector.transfer_read %13[%arg0, %arg1, %arg2, %c0], %cst_4 {in_bounds = [true, true]} : tensor<2x10x4096x64xf16>, vector<64x64xf16>
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%17 = tensor.empty() : tensor<64x64xf16>
%18 = arith.mulf %16, %cst_0 : vector<64x64xf16>
%19 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
%c0_5 = arith.constant 0 : index
%20 = vector.transfer_write %18, %19[%c0_5, %c0_5] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
%21 = iree_gpu.value_barrier %20 : tensor<64x64xf16, #gpu.address_space<workgroup>>
%c0_6 = arith.constant 0 : index
%cst_7 = arith.constant 0.000000e+00 : f16
%22 = vector.transfer_read %21[%c0_6, %c0_6], %cst_7 {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%24 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%25:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %cst_2, %arg6 = %cst_1, %arg7 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
gpu.barrier
gpu.barrier
%32 = vector.transfer_read %12[%arg0, %arg1, %arg4, %c0], %cst_4 {in_bounds = [true, true]} : tensor<2x10x4096x64xf16>, vector<64x64xf16>
%33 = iree_vector_ext.to_layout %32 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%34 = vector.transfer_read %10[%arg0, %arg1, %c0, %arg4], %cst_4 {in_bounds = [true, true]} : tensor<2x10x64x4096xf16>, vector<64x64xf16>
%35 = iree_vector_ext.to_layout %34 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%36 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
%c0_8 = arith.constant 0 : index
%37 = vector.transfer_write %33, %36[%c0_8, %c0_8] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
%38 = iree_gpu.value_barrier %37 : tensor<64x64xf16, #gpu.address_space<workgroup>>
%c0_9 = arith.constant 0 : index
%cst_10 = arith.constant 0.000000e+00 : f16
%39 = vector.transfer_read %38[%c0_9, %c0_9], %cst_10 {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%40 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%41 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %40, %23, %24 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%42 = iree_vector_ext.to_layout %41 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%43 = vector.multi_reduction <maximumf>, %42, %arg5 [1] : vector<64x64xf32> to vector<64xf32>
%44 = arith.subf %arg5, %43 : vector<64xf32>
%45 = math.exp2 %44 : vector<64xf32>
%46 = arith.mulf %45, %arg6 : vector<64xf32>
%47 = vector.broadcast %43 : vector<64xf32> to vector<64x64xf32>
%48 = vector.transpose %47, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%49 = arith.subf %42, %48 : vector<64x64xf32>
%50 = math.exp2 %49 : vector<64x64xf32>
%51 = vector.multi_reduction <add>, %50, %46 [1] : vector<64x64xf32> to vector<64xf32>
%52 = arith.truncf %50 : vector<64x64xf32> to vector<64x64xf16>
%53 = vector.broadcast %45 : vector<64xf32> to vector<64x64xf32>
%54 = vector.transpose %53, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%55 = arith.mulf %54, %arg7 : vector<64x64xf32>
%56 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
%c0_11 = arith.constant 0 : index
%57 = vector.transfer_write %35, %56[%c0_11, %c0_11] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
%58 = iree_gpu.value_barrier %57 : tensor<64x64xf16, #gpu.address_space<workgroup>>
%c0_12 = arith.constant 0 : index
%cst_13 = arith.constant 0.000000e+00 : f16
%59 = vector.transfer_read %58[%c0_12, %c0_12], %cst_13 {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%60 = iree_vector_ext.to_layout %59 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%61 = iree_vector_ext.to_layout %52 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%62 = iree_vector_ext.to_layout %55 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%63 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %61, %62 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%64 = iree_vector_ext.to_layout %63 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %43, %51, %64 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
}
%26 = vector.broadcast %25#1 : vector<64xf32> to vector<64x64xf32>
%27 = arith.divf %cst, %26 : vector<64x64xf32>
%28 = vector.transpose %27, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%29 = arith.mulf %28, %25#2 : vector<64x64xf32>
%30 = arith.truncf %29 : vector<64x64xf32> to vector<64x64xf16>
%31 = vector.transfer_write %30, %17[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16>
%extracted_slice = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%inserted_slice = tensor.insert_slice %31 into %extracted_slice[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %inserted_slice into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
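// NOTE: in the GPUVectorAllocPass dump above, the to_layout ops that previously carried the
// shared_memory_conversion attribute now have their operands staged through workgroup memory:
// each MMA input goes through bufferization.alloc_tensor (#gpu.address_space<workgroup>),
// a vector.transfer_write, an iree_gpu.value_barrier, and a vector.transfer_read before the
// vector.contract, and extra gpu.barrier ops order the reuse of those allocations across
// loop iterations.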
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_4 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
gpu.barrier
%15 = vector.transfer_read %13[%arg0, %arg1, %arg2, %c0], %cst_4 {in_bounds = [true, true]} : tensor<2x10x4096x64xf16>, vector<64x64xf16>
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%17 = tensor.empty() : tensor<64x64xf16>
%18 = arith.mulf %16, %cst_0 : vector<64x64xf16>
%19 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
%20 = vector.transfer_write %18, %19[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
%21 = iree_gpu.value_barrier %20 : tensor<64x64xf16, #gpu.address_space<workgroup>>
%22 = vector.transfer_read %21[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%24 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%25:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %cst_2, %arg6 = %cst_1, %arg7 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
gpu.barrier
%32 = vector.transfer_read %12[%arg0, %arg1, %arg4, %c0], %cst_4 {in_bounds = [true, true]} : tensor<2x10x4096x64xf16>, vector<64x64xf16>
%33 = iree_vector_ext.to_layout %32 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%34 = vector.transfer_read %10[%arg0, %arg1, %c0, %arg4], %cst_4 {in_bounds = [true, true]} : tensor<2x10x64x4096xf16>, vector<64x64xf16>
%35 = iree_vector_ext.to_layout %34 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%36 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
%37 = vector.transfer_write %33, %36[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
%38 = iree_gpu.value_barrier %37 : tensor<64x64xf16, #gpu.address_space<workgroup>>
%39 = vector.transfer_read %38[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%40 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%41 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %40, %23, %24 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%42 = iree_vector_ext.to_layout %41 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%43 = vector.multi_reduction <maximumf>, %42, %arg5 [1] : vector<64x64xf32> to vector<64xf32>
%44 = arith.subf %arg5, %43 : vector<64xf32>
%45 = math.exp2 %44 : vector<64xf32>
%46 = arith.mulf %45, %arg6 : vector<64xf32>
%47 = vector.broadcast %43 : vector<64xf32> to vector<64x64xf32>
%48 = vector.transpose %47, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%49 = arith.subf %42, %48 : vector<64x64xf32>
%50 = math.exp2 %49 : vector<64x64xf32>
%51 = vector.multi_reduction <add>, %50, %46 [1] : vector<64x64xf32> to vector<64xf32>
%52 = arith.truncf %50 : vector<64x64xf32> to vector<64x64xf16>
%53 = vector.broadcast %45 : vector<64xf32> to vector<64x64xf32>
%54 = vector.transpose %53, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%55 = arith.mulf %54, %arg7 : vector<64x64xf32>
%56 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
%57 = vector.transfer_write %35, %56[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
%58 = iree_gpu.value_barrier %57 : tensor<64x64xf16, #gpu.address_space<workgroup>>
%59 = vector.transfer_read %58[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%60 = iree_vector_ext.to_layout %59 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%61 = iree_vector_ext.to_layout %52 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%62 = iree_vector_ext.to_layout %55 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%63 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %61, %62 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%64 = iree_vector_ext.to_layout %63 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %43, %51, %64 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
}
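// Epilogue: the accumulator is normalized by the running softmax sum (reciprocal broadcast and
// transposed back onto rows), truncated to f16, and the 64x64 tile is inserted into the
// 2x4096x10x64 output at [batch, query-tile, head, :].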
%26 = vector.broadcast %25#1 : vector<64xf32> to vector<64x64xf32>
%27 = arith.divf %cst, %26 : vector<64x64xf32>
%28 = vector.transpose %27, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%29 = arith.mulf %28, %25#2 : vector<64x64xf32>
%30 = arith.truncf %29 : vector<64x64xf32> to vector<64x64xf16>
%31 = vector.transfer_write %30, %17[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16>
%extracted_slice = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%inserted_slice = tensor.insert_slice %31 into %extracted_slice[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %inserted_slice into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After CSE (cse) //----- //
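// The dump below appears structurally identical to the one above: CSE only deduplicates repeated
// identical subexpressions and leaves the attention loop itself unchanged.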
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
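// 1.802980e-01 ~= 0.125 * log2(e): the 1/sqrt(64) attention scale pre-folded with log2(e) so the
// softmax can be evaluated with exp2 instead of exp.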
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_4 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
gpu.barrier
%15 = vector.transfer_read %13[%arg0, %arg1, %arg2, %c0], %cst_4 {in_bounds = [true, true]} : tensor<2x10x4096x64xf16>, vector<64x64xf16>
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%17 = tensor.empty() : tensor<64x64xf16>
%18 = arith.mulf %16, %cst_0 : vector<64x64xf16>
%19 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
%20 = vector.transfer_write %18, %19[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
%21 = iree_gpu.value_barrier %20 : tensor<64x64xf16, #gpu.address_space<workgroup>>
%22 = vector.transfer_read %21[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%24 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%25:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %cst_2, %arg6 = %cst_1, %arg7 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
gpu.barrier
%32 = vector.transfer_read %12[%arg0, %arg1, %arg4, %c0], %cst_4 {in_bounds = [true, true]} : tensor<2x10x4096x64xf16>, vector<64x64xf16>
%33 = iree_vector_ext.to_layout %32 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%34 = vector.transfer_read %10[%arg0, %arg1, %c0, %arg4], %cst_4 {in_bounds = [true, true]} : tensor<2x10x64x4096xf16>, vector<64x64xf16>
%35 = iree_vector_ext.to_layout %34 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%36 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
%37 = vector.transfer_write %33, %36[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
%38 = iree_gpu.value_barrier %37 : tensor<64x64xf16, #gpu.address_space<workgroup>>
%39 = vector.transfer_read %38[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%40 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%41 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %40, %23, %24 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%42 = iree_vector_ext.to_layout %41 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%43 = vector.multi_reduction <maximumf>, %42, %arg5 [1] : vector<64x64xf32> to vector<64xf32>
%44 = arith.subf %arg5, %43 : vector<64xf32>
%45 = math.exp2 %44 : vector<64xf32>
%46 = arith.mulf %45, %arg6 : vector<64xf32>
%47 = vector.broadcast %43 : vector<64xf32> to vector<64x64xf32>
%48 = vector.transpose %47, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%49 = arith.subf %42, %48 : vector<64x64xf32>
%50 = math.exp2 %49 : vector<64x64xf32>
%51 = vector.multi_reduction <add>, %50, %46 [1] : vector<64x64xf32> to vector<64xf32>
%52 = arith.truncf %50 : vector<64x64xf32> to vector<64x64xf16>
%53 = vector.broadcast %45 : vector<64xf32> to vector<64x64xf32>
%54 = vector.transpose %53, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%55 = arith.mulf %54, %arg7 : vector<64x64xf32>
%56 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
%57 = vector.transfer_write %35, %56[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
%58 = iree_gpu.value_barrier %57 : tensor<64x64xf16, #gpu.address_space<workgroup>>
%59 = vector.transfer_read %58[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%60 = iree_vector_ext.to_layout %59 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%61 = iree_vector_ext.to_layout %52 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%62 = iree_vector_ext.to_layout %55 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%63 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %61, %62 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%64 = iree_vector_ext.to_layout %63 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %43, %51, %64 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
}
%26 = vector.broadcast %25#1 : vector<64xf32> to vector<64x64xf32>
%27 = arith.divf %cst, %26 : vector<64x64xf32>
%28 = vector.transpose %27, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%29 = arith.mulf %28, %25#2 : vector<64x64xf32>
%30 = arith.truncf %29 : vector<64x64xf32> to vector<64x64xf16>
%31 = vector.transfer_write %30, %17[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16>
%extracted_slice = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%inserted_slice = tensor.insert_slice %31 into %extracted_slice[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %inserted_slice into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After GPUCombineValueBarriersPass (iree-codegen-gpu-combine-value-barriers) //----- //
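// The two separate iree_gpu.value_barrier ops that guarded the K and V shared-memory tiles inside
// the loop are fused into a single two-operand barrier (%40:2 below), so each iteration issues one
// synchronization point instead of two.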
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_4 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = tensor.empty() : tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
gpu.barrier
%15 = vector.transfer_read %13[%arg0, %arg1, %arg2, %c0], %cst_4 {in_bounds = [true, true]} : tensor<2x10x4096x64xf16>, vector<64x64xf16>
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%17 = tensor.empty() : tensor<64x64xf16>
%18 = arith.mulf %16, %cst_0 : vector<64x64xf16>
%19 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
%20 = vector.transfer_write %18, %19[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
%21 = iree_gpu.value_barrier %20 : tensor<64x64xf16, #gpu.address_space<workgroup>>
%22 = vector.transfer_read %21[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%24 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%25:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %cst_2, %arg6 = %cst_1, %arg7 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
gpu.barrier
%32 = vector.transfer_read %12[%arg0, %arg1, %arg4, %c0], %cst_4 {in_bounds = [true, true]} : tensor<2x10x4096x64xf16>, vector<64x64xf16>
%33 = iree_vector_ext.to_layout %32 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%34 = vector.transfer_read %10[%arg0, %arg1, %c0, %arg4], %cst_4 {in_bounds = [true, true]} : tensor<2x10x64x4096xf16>, vector<64x64xf16>
%35 = iree_vector_ext.to_layout %34 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%36 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
%37 = vector.transfer_write %33, %36[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
%38 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
%39 = vector.transfer_write %35, %38[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
%40:2 = iree_gpu.value_barrier %37, %39 : tensor<64x64xf16, #gpu.address_space<workgroup>>, tensor<64x64xf16, #gpu.address_space<workgroup>>
%41 = vector.transfer_read %40#0[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%42 = iree_vector_ext.to_layout %41 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%43 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %42, %23, %24 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%44 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%45 = vector.multi_reduction <maximumf>, %44, %arg5 [1] : vector<64x64xf32> to vector<64xf32>
%46 = arith.subf %arg5, %45 : vector<64xf32>
%47 = math.exp2 %46 : vector<64xf32>
%48 = arith.mulf %47, %arg6 : vector<64xf32>
%49 = vector.broadcast %45 : vector<64xf32> to vector<64x64xf32>
%50 = vector.transpose %49, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%51 = arith.subf %44, %50 : vector<64x64xf32>
%52 = math.exp2 %51 : vector<64x64xf32>
%53 = vector.multi_reduction <add>, %52, %48 [1] : vector<64x64xf32> to vector<64xf32>
%54 = arith.truncf %52 : vector<64x64xf32> to vector<64x64xf16>
%55 = vector.broadcast %47 : vector<64xf32> to vector<64x64xf32>
%56 = vector.transpose %55, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%57 = arith.mulf %56, %arg7 : vector<64x64xf32>
%58 = vector.transfer_read %40#1[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%59 = iree_vector_ext.to_layout %58 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%60 = iree_vector_ext.to_layout %54 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%61 = iree_vector_ext.to_layout %57 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %59, %60, %61 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%63 = iree_vector_ext.to_layout %62 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %45, %53, %63 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
}
%26 = vector.broadcast %25#1 : vector<64xf32> to vector<64x64xf32>
%27 = arith.divf %cst, %26 : vector<64x64xf32>
%28 = vector.transpose %27, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%29 = arith.mulf %28, %25#2 : vector<64x64xf32>
%30 = arith.truncf %29 : vector<64x64xf32> to vector<64x64xf16>
%31 = vector.transfer_write %30, %17[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16>
%extracted_slice = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%inserted_slice = tensor.insert_slice %31 into %extracted_slice[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %inserted_slice into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- //
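// The scf.forall destination is now initialized from a flow.dispatch.tensor.load of the writeonly
// output binding (%11) instead of a tensor.empty, so bufferization can later write each result
// tile directly into the output buffer.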
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_4 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = flow.dispatch.tensor.load %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<2x4096x10x64xf16>
%12 = tensor.empty() : tensor<2x4096x10x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%15 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
gpu.barrier
%16 = vector.transfer_read %14[%arg0, %arg1, %arg2, %c0], %cst_4 {in_bounds = [true, true]} : tensor<2x10x4096x64xf16>, vector<64x64xf16>
%17 = iree_vector_ext.to_layout %16 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%18 = tensor.empty() : tensor<64x64xf16>
%19 = arith.mulf %17, %cst_0 : vector<64x64xf16>
%20 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
%21 = vector.transfer_write %19, %20[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
%22 = iree_gpu.value_barrier %21 : tensor<64x64xf16, #gpu.address_space<workgroup>>
%23 = vector.transfer_read %22[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%25 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%26:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %cst_2, %arg6 = %cst_1, %arg7 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
gpu.barrier
%33 = vector.transfer_read %13[%arg0, %arg1, %arg4, %c0], %cst_4 {in_bounds = [true, true]} : tensor<2x10x4096x64xf16>, vector<64x64xf16>
%34 = iree_vector_ext.to_layout %33 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%35 = vector.transfer_read %10[%arg0, %arg1, %c0, %arg4], %cst_4 {in_bounds = [true, true]} : tensor<2x10x64x4096xf16>, vector<64x64xf16>
%36 = iree_vector_ext.to_layout %35 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%37 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
%38 = vector.transfer_write %34, %37[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
%39 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
%40 = vector.transfer_write %36, %39[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
%41:2 = iree_gpu.value_barrier %38, %40 : tensor<64x64xf16, #gpu.address_space<workgroup>>, tensor<64x64xf16, #gpu.address_space<workgroup>>
%42 = vector.transfer_read %41#0[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%43 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%44 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %43, %24, %25 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%45 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%46 = vector.multi_reduction <maximumf>, %45, %arg5 [1] : vector<64x64xf32> to vector<64xf32>
%47 = arith.subf %arg5, %46 : vector<64xf32>
%48 = math.exp2 %47 : vector<64xf32>
%49 = arith.mulf %48, %arg6 : vector<64xf32>
%50 = vector.broadcast %46 : vector<64xf32> to vector<64x64xf32>
%51 = vector.transpose %50, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%52 = arith.subf %45, %51 : vector<64x64xf32>
%53 = math.exp2 %52 : vector<64x64xf32>
%54 = vector.multi_reduction <add>, %53, %49 [1] : vector<64x64xf32> to vector<64xf32>
%55 = arith.truncf %53 : vector<64x64xf32> to vector<64x64xf16>
%56 = vector.broadcast %48 : vector<64xf32> to vector<64x64xf32>
%57 = vector.transpose %56, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%58 = arith.mulf %57, %arg7 : vector<64x64xf32>
%59 = vector.transfer_read %41#1[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%60 = iree_vector_ext.to_layout %59 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%61 = iree_vector_ext.to_layout %55 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%62 = iree_vector_ext.to_layout %58 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%63 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %61, %62 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%64 = iree_vector_ext.to_layout %63 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %46, %54, %64 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
}
%27 = vector.broadcast %26#1 : vector<64xf32> to vector<64x64xf32>
%28 = arith.divf %cst, %27 : vector<64x64xf32>
%29 = vector.transpose %28, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%30 = arith.mulf %29, %26#2 : vector<64x64xf32>
%31 = arith.truncf %30 : vector<64x64xf32> to vector<64x64xf16>
%32 = vector.transfer_write %31, %18[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16>
%extracted_slice = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%inserted_slice = tensor.insert_slice %32 into %extracted_slice[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %inserted_slice into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %15, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
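// The remaining tensor.empty for the per-workgroup 64x64 result tile (%17) is rewritten to
// bufferization.alloc_tensor, giving every destination an explicit allocation before bufferization.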
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_4 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
%11 = flow.dispatch.tensor.load %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<2x4096x10x64xf16>
%12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
%14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
gpu.barrier
%15 = vector.transfer_read %13[%arg0, %arg1, %arg2, %c0], %cst_4 {in_bounds = [true, true]} : tensor<2x10x4096x64xf16>, vector<64x64xf16>
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%17 = bufferization.alloc_tensor() : tensor<64x64xf16>
%18 = arith.mulf %16, %cst_0 : vector<64x64xf16>
%19 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
%20 = vector.transfer_write %18, %19[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
%21 = iree_gpu.value_barrier %20 : tensor<64x64xf16, #gpu.address_space<workgroup>>
%22 = vector.transfer_read %21[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%24 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%25:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %cst_2, %arg6 = %cst_1, %arg7 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
gpu.barrier
%32 = vector.transfer_read %12[%arg0, %arg1, %arg4, %c0], %cst_4 {in_bounds = [true, true]} : tensor<2x10x4096x64xf16>, vector<64x64xf16>
%33 = iree_vector_ext.to_layout %32 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%34 = vector.transfer_read %10[%arg0, %arg1, %c0, %arg4], %cst_4 {in_bounds = [true, true]} : tensor<2x10x64x4096xf16>, vector<64x64xf16>
%35 = iree_vector_ext.to_layout %34 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%36 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
%37 = vector.transfer_write %33, %36[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
%38 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
%39 = vector.transfer_write %35, %38[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
%40:2 = iree_gpu.value_barrier %37, %39 : tensor<64x64xf16, #gpu.address_space<workgroup>>, tensor<64x64xf16, #gpu.address_space<workgroup>>
%41 = vector.transfer_read %40#0[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%42 = iree_vector_ext.to_layout %41 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%43 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %42, %23, %24 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%44 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%45 = vector.multi_reduction <maximumf>, %44, %arg5 [1] : vector<64x64xf32> to vector<64xf32>
%46 = arith.subf %arg5, %45 : vector<64xf32>
%47 = math.exp2 %46 : vector<64xf32>
%48 = arith.mulf %47, %arg6 : vector<64xf32>
%49 = vector.broadcast %45 : vector<64xf32> to vector<64x64xf32>
%50 = vector.transpose %49, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%51 = arith.subf %44, %50 : vector<64x64xf32>
%52 = math.exp2 %51 : vector<64x64xf32>
%53 = vector.multi_reduction <add>, %52, %48 [1] : vector<64x64xf32> to vector<64xf32>
%54 = arith.truncf %52 : vector<64x64xf32> to vector<64x64xf16>
%55 = vector.broadcast %47 : vector<64xf32> to vector<64x64xf32>
%56 = vector.transpose %55, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%57 = arith.mulf %56, %arg7 : vector<64x64xf32>
%58 = vector.transfer_read %40#1[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%59 = iree_vector_ext.to_layout %58 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%60 = iree_vector_ext.to_layout %54 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%61 = iree_vector_ext.to_layout %57 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %59, %60, %61 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%63 = iree_vector_ext.to_layout %62 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %45, %53, %63 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
}
%26 = vector.broadcast %25#1 : vector<64xf32> to vector<64x64xf32>
%27 = arith.divf %cst, %26 : vector<64x64xf32>
%28 = vector.transpose %27, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%29 = arith.mulf %28, %25#2 : vector<64x64xf32>
%30 = arith.truncf %29 : vector<64x64xf32> to vector<64x64xf16>
%31 = vector.transfer_write %30, %17[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16>
%extracted_slice = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
%inserted_slice = tensor.insert_slice %31 into %extracted_slice[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %inserted_slice into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
return
}
// -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- //
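// Relative to the tensor-level IR that ends above, bufferization binds the dispatch
// directly to memrefs on the HAL buffers: bufferization.alloc_tensor becomes
// memref.alloc in #gpu.address_space<workgroup>, iree_gpu.value_barrier lowers to
// gpu.barrier, and the final flow.dispatch.tensor.store is now expressed as
// memref.copy ops into the output binding.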
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_4 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview = memref.subview %7[1, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %7[0, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
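  // The two subviews split the fused 2x2x10x4096x64 input along its leading dimension:
  // index 0 (%subview_5) feeds the tile that is read once and scaled per workgroup
  // (effectively the query), while index 1 (%subview) feeds the tiles streamed inside
  // the reduction loop (effectively the keys). The forall below maps workgroups over
  // batch (2), heads (10), and 64-row query tiles of the 4096 sequence (z/y/x).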
scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) {
gpu.barrier
%10 = vector.transfer_read %subview_5[%arg0, %arg1, %arg2, %c0], %cst_4 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%12 = arith.mulf %11, %cst_0 : vector<64x64xf16>
%alloc_6 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %12, %alloc_6[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%13 = vector.transfer_read %alloc_6[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%15 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
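    // Flash-attention-style inner loop over 64-wide key/value tiles: %arg4, %arg5 and
    // %arg6 carry the running row maximum, the running softmax denominator and the
    // output accumulator. Each step stages the K and V tiles in workgroup memory,
    // forms the 64x64 score tile with the first vector.contract (MFMA layout),
    // rescales the previous state by exp2 of the max shift, and accumulates
    // probabilities times V with the second vector.contract.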
%16:3 = scf.for %arg3 = %c0 to %c4096 step %c64 iter_args(%arg4 = %cst_2, %arg5 = %cst_1, %arg6 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
gpu.barrier
%22 = vector.transfer_read %subview[%arg0, %arg1, %arg3, %c0], %cst_4 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%24 = vector.transfer_read %8[%arg0, %arg1, %c0, %arg3], %cst_4 {in_bounds = [true, true]} : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%25 = iree_vector_ext.to_layout %24 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%alloc_10 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %23, %alloc_10[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc_11 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %25, %alloc_11[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%26 = vector.transfer_read %alloc_10[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%27 = iree_vector_ext.to_layout %26 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%28 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %27, %14, %15 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%29 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%30 = vector.multi_reduction <maximumf>, %29, %arg4 [1] : vector<64x64xf32> to vector<64xf32>
%31 = arith.subf %arg4, %30 : vector<64xf32>
%32 = math.exp2 %31 : vector<64xf32>
%33 = arith.mulf %32, %arg5 : vector<64xf32>
%34 = vector.broadcast %30 : vector<64xf32> to vector<64x64xf32>
%35 = vector.transpose %34, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%36 = arith.subf %29, %35 : vector<64x64xf32>
%37 = math.exp2 %36 : vector<64x64xf32>
%38 = vector.multi_reduction <add>, %37, %33 [1] : vector<64x64xf32> to vector<64xf32>
%39 = arith.truncf %37 : vector<64x64xf32> to vector<64x64xf16>
%40 = vector.broadcast %32 : vector<64xf32> to vector<64x64xf32>
%41 = vector.transpose %40, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%42 = arith.mulf %41, %arg6 : vector<64x64xf32>
%43 = vector.transfer_read %alloc_11[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%44 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%45 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%46 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %45, %46 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%48 = iree_vector_ext.to_layout %47 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %30, %38, %48 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
}
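    // Epilogue: divide the accumulator by the final denominator (%16#1), truncate to
    // f16, stage the 64x64 result tile in %alloc, and copy it into the 2x4096x10x64
    // output binding through the subviews below.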
%17 = vector.broadcast %16#1 : vector<64xf32> to vector<64x64xf32>
%18 = arith.divf %cst, %17 : vector<64x64xf32>
%19 = vector.transpose %18, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%20 = arith.mulf %19, %16#2 : vector<64x64xf32>
%21 = arith.truncf %20 : vector<64x64xf32> to vector<64x64xf16>
vector.transfer_write %21, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %9[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_7[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %alloc, %subview_8 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<64x64xf16, #gpu.address_space<workgroup>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
%subview_9 = memref.subview %9[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_7, %subview_9 : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.copy %9, %9 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
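For reference, the scf.for loop in the dump above is the standard online-softmax recurrence. Below is a rough NumPy sketch of that recurrence, not part of the IR dump: the name attention_tile_loop, the q/k/v arguments and the tile size are illustrative, and it assumes the 2x10x64x4096 binding plays the role of the value tensor. The 0.125 * log2(e) scale matches the 1.802980e-01 constant in the IR, which is why the loop uses exp2 rather than exp.

import numpy as np

def attention_tile_loop(q, k, v, tile=64):
    # q: (64, d) query tile, k: (seq, d), v: (seq, d) -- illustrative layouts only.
    scale = 0.125 * np.log2(np.e)             # score scale with log2(e) folded in
    m = np.full(q.shape[0], -np.inf)          # running row maximum
    l = np.zeros(q.shape[0])                  # running softmax denominator
    acc = np.zeros((q.shape[0], v.shape[1]))  # running output accumulator
    qs = q * scale
    for j in range(0, k.shape[0], tile):
        s = qs @ k[j:j + tile].T              # 64x64 score tile (first contract)
        m_new = np.maximum(m, s.max(axis=1))
        alpha = np.exp2(m - m_new)            # rescale factor for the old state
        p = np.exp2(s - m_new[:, None])       # unnormalized probabilities
        l = l * alpha + p.sum(axis=1)
        acc = acc * alpha[:, None] + p @ v[j:j + tile]  # second contract
        m = m_new
    return acc / l[:, None]                   # epilogue: divide by the denominator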
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
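// This dump reproduces the bufferized IR above unchanged; every shape in this dispatch
// is static, so resolve-shaped-type-result-dims has nothing to rewrite.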
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_4 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview = memref.subview %7[1, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %7[0, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) {
gpu.barrier
%10 = vector.transfer_read %subview_5[%arg0, %arg1, %arg2, %c0], %cst_4 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%12 = arith.mulf %11, %cst_0 : vector<64x64xf16>
%alloc_6 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %12, %alloc_6[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%13 = vector.transfer_read %alloc_6[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%15 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%16:3 = scf.for %arg3 = %c0 to %c4096 step %c64 iter_args(%arg4 = %cst_2, %arg5 = %cst_1, %arg6 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
gpu.barrier
%22 = vector.transfer_read %subview[%arg0, %arg1, %arg3, %c0], %cst_4 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%24 = vector.transfer_read %8[%arg0, %arg1, %c0, %arg3], %cst_4 {in_bounds = [true, true]} : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%25 = iree_vector_ext.to_layout %24 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%alloc_10 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %23, %alloc_10[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc_11 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %25, %alloc_11[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%26 = vector.transfer_read %alloc_10[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%27 = iree_vector_ext.to_layout %26 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%28 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %27, %14, %15 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%29 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%30 = vector.multi_reduction <maximumf>, %29, %arg4 [1] : vector<64x64xf32> to vector<64xf32>
%31 = arith.subf %arg4, %30 : vector<64xf32>
%32 = math.exp2 %31 : vector<64xf32>
%33 = arith.mulf %32, %arg5 : vector<64xf32>
%34 = vector.broadcast %30 : vector<64xf32> to vector<64x64xf32>
%35 = vector.transpose %34, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%36 = arith.subf %29, %35 : vector<64x64xf32>
%37 = math.exp2 %36 : vector<64x64xf32>
%38 = vector.multi_reduction <add>, %37, %33 [1] : vector<64x64xf32> to vector<64xf32>
%39 = arith.truncf %37 : vector<64x64xf32> to vector<64x64xf16>
%40 = vector.broadcast %32 : vector<64xf32> to vector<64x64xf32>
%41 = vector.transpose %40, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%42 = arith.mulf %41, %arg6 : vector<64x64xf32>
%43 = vector.transfer_read %alloc_11[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%44 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%45 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%46 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %45, %46 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%48 = iree_vector_ext.to_layout %47 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %30, %38, %48 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
}
%17 = vector.broadcast %16#1 : vector<64xf32> to vector<64x64xf32>
%18 = arith.divf %cst, %17 : vector<64x64xf32>
%19 = vector.transpose %18, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%20 = arith.mulf %19, %16#2 : vector<64x64xf32>
%21 = arith.truncf %20 : vector<64x64xf32> to vector<64x64xf16>
vector.transfer_write %21, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %9[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_7[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %alloc, %subview_8 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<64x64xf16, #gpu.address_space<workgroup>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
%subview_9 = memref.subview %9[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_7, %subview_9 : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.copy %9, %9 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
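// The visible change from canonicalization is the removal of the trailing whole-buffer
// self-copy (memref.copy %9, %9) after the scf.forall; the loop nest itself is untouched.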
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_4 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview = memref.subview %7[1, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %7[0, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) {
gpu.barrier
%10 = vector.transfer_read %subview_5[%arg0, %arg1, %arg2, %c0], %cst_4 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%12 = arith.mulf %11, %cst_0 : vector<64x64xf16>
%alloc_6 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %12, %alloc_6[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%13 = vector.transfer_read %alloc_6[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%15 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%16:3 = scf.for %arg3 = %c0 to %c4096 step %c64 iter_args(%arg4 = %cst_2, %arg5 = %cst_1, %arg6 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
gpu.barrier
%22 = vector.transfer_read %subview[%arg0, %arg1, %arg3, %c0], %cst_4 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%24 = vector.transfer_read %8[%arg0, %arg1, %c0, %arg3], %cst_4 {in_bounds = [true, true]} : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%25 = iree_vector_ext.to_layout %24 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%alloc_10 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %23, %alloc_10[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc_11 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %25, %alloc_11[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%26 = vector.transfer_read %alloc_10[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%27 = iree_vector_ext.to_layout %26 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%28 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %27, %14, %15 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%29 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%30 = vector.multi_reduction <maximumf>, %29, %arg4 [1] : vector<64x64xf32> to vector<64xf32>
%31 = arith.subf %arg4, %30 : vector<64xf32>
%32 = math.exp2 %31 : vector<64xf32>
%33 = arith.mulf %32, %arg5 : vector<64xf32>
%34 = vector.broadcast %30 : vector<64xf32> to vector<64x64xf32>
%35 = vector.transpose %34, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%36 = arith.subf %29, %35 : vector<64x64xf32>
%37 = math.exp2 %36 : vector<64x64xf32>
%38 = vector.multi_reduction <add>, %37, %33 [1] : vector<64x64xf32> to vector<64xf32>
%39 = arith.truncf %37 : vector<64x64xf32> to vector<64x64xf16>
%40 = vector.broadcast %32 : vector<64xf32> to vector<64x64xf32>
%41 = vector.transpose %40, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%42 = arith.mulf %41, %arg6 : vector<64x64xf32>
%43 = vector.transfer_read %alloc_11[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%44 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%45 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%46 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %45, %46 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%48 = iree_vector_ext.to_layout %47 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %30, %38, %48 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
}
%17 = vector.broadcast %16#1 : vector<64xf32> to vector<64x64xf32>
%18 = arith.divf %cst, %17 : vector<64x64xf32>
%19 = vector.transpose %18, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%20 = arith.mulf %19, %16#2 : vector<64x64xf32>
%21 = arith.truncf %20 : vector<64x64xf32> to vector<64x64xf16>
vector.transfer_write %21, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %9[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_7[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %alloc, %subview_8 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<64x64xf16, #gpu.address_space<workgroup>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
%subview_9 = memref.subview %9[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_7, %subview_9 : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
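// CSE folds the duplicated output subview (the former %subview_9) into %subview_7, so
// the redundant copy at the end of the forall body below becomes a self-copy, and later
// value names shift accordingly (e.g. the in-loop allocs are now %alloc_9/%alloc_10).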
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_4 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview = memref.subview %7[1, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %7[0, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) {
gpu.barrier
%10 = vector.transfer_read %subview_5[%arg0, %arg1, %arg2, %c0], %cst_4 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%12 = arith.mulf %11, %cst_0 : vector<64x64xf16>
%alloc_6 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %12, %alloc_6[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%13 = vector.transfer_read %alloc_6[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%15 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%16:3 = scf.for %arg3 = %c0 to %c4096 step %c64 iter_args(%arg4 = %cst_2, %arg5 = %cst_1, %arg6 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
gpu.barrier
%22 = vector.transfer_read %subview[%arg0, %arg1, %arg3, %c0], %cst_4 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%24 = vector.transfer_read %8[%arg0, %arg1, %c0, %arg3], %cst_4 {in_bounds = [true, true]} : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%25 = iree_vector_ext.to_layout %24 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%alloc_9 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %23, %alloc_9[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc_10 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %25, %alloc_10[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%26 = vector.transfer_read %alloc_9[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%27 = iree_vector_ext.to_layout %26 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%28 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %27, %14, %15 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%29 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%30 = vector.multi_reduction <maximumf>, %29, %arg4 [1] : vector<64x64xf32> to vector<64xf32>
%31 = arith.subf %arg4, %30 : vector<64xf32>
%32 = math.exp2 %31 : vector<64xf32>
%33 = arith.mulf %32, %arg5 : vector<64xf32>
%34 = vector.broadcast %30 : vector<64xf32> to vector<64x64xf32>
%35 = vector.transpose %34, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%36 = arith.subf %29, %35 : vector<64x64xf32>
%37 = math.exp2 %36 : vector<64x64xf32>
%38 = vector.multi_reduction <add>, %37, %33 [1] : vector<64x64xf32> to vector<64xf32>
%39 = arith.truncf %37 : vector<64x64xf32> to vector<64x64xf16>
%40 = vector.broadcast %32 : vector<64xf32> to vector<64x64xf32>
%41 = vector.transpose %40, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%42 = arith.mulf %41, %arg6 : vector<64x64xf32>
%43 = vector.transfer_read %alloc_10[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%44 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%45 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%46 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %45, %46 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%48 = iree_vector_ext.to_layout %47 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %30, %38, %48 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
}
%17 = vector.broadcast %16#1 : vector<64xf32> to vector<64x64xf32>
%18 = arith.divf %cst, %17 : vector<64x64xf32>
%19 = vector.transpose %18, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%20 = arith.mulf %19, %16#2 : vector<64x64xf32>
%21 = arith.truncf %20 : vector<64x64xf32> to vector<64x64xf16>
vector.transfer_write %21, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %9[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_7[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %alloc, %subview_8 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<64x64xf16, #gpu.address_space<workgroup>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_7, %subview_7 : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
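// NOTE (editorial annotation, not part of the compiler dump): the function above is the
// tiled flash-attention kernel for the 2x10x4096x64 f16 attention op. The scf.forall
// launches one workgroup per (batch = 2, head = 10, 64-row query tile of the 4096 rows)
// with workgroup_size = [128, 1, 1], i.e. two 64-lane subgroups (subgroup_tile = [2, 1]
// in the accumulator layout). The inner scf.for walks the 4096-long K/V dimension in
// 64-wide tiles, carrying the running row maximum, the running row sum, and the f32
// accumulator as iter_args; each step performs one vector.contract producing the score
// tile from the pre-scaled Q and the K tile and a second contract accumulating P·V, both
// on MFMA_F32_16x16x16_F16, with exp2-based online-softmax rescaling in between (the
// dense<1.802980e-01> constant is, to f16 precision, the 0.125 head scale pre-multiplied
// by log2(e) so exp2 can be used). The epilogue divides the accumulator by the row sum,
// truncates to f16, and stages the tile through workgroup memory before copying it into
// the 2x4096x10x64 output buffer.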
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_4 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview = memref.subview %7[1, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %7[0, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) {
gpu.barrier
%10 = vector.transfer_read %subview_5[%arg0, %arg1, %arg2, %c0], %cst_4 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%12 = arith.mulf %11, %cst_0 : vector<64x64xf16>
%alloc_6 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %12, %alloc_6[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%13 = vector.transfer_read %alloc_6[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%15 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%16:3 = scf.for %arg3 = %c0 to %c4096 step %c64 iter_args(%arg4 = %cst_2, %arg5 = %cst_1, %arg6 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
gpu.barrier
%22 = vector.transfer_read %subview[%arg0, %arg1, %arg3, %c0], %cst_4 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%24 = vector.transfer_read %8[%arg0, %arg1, %c0, %arg3], %cst_4 {in_bounds = [true, true]} : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%25 = iree_vector_ext.to_layout %24 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%alloc_9 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %23, %alloc_9[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc_10 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %25, %alloc_10[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%26 = vector.transfer_read %alloc_9[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%27 = iree_vector_ext.to_layout %26 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%28 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %27, %14, %15 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%29 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%30 = vector.multi_reduction <maximumf>, %29, %arg4 [1] : vector<64x64xf32> to vector<64xf32>
%31 = arith.subf %arg4, %30 : vector<64xf32>
%32 = math.exp2 %31 : vector<64xf32>
%33 = arith.mulf %32, %arg5 : vector<64xf32>
%34 = vector.broadcast %30 : vector<64xf32> to vector<64x64xf32>
%35 = vector.transpose %34, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%36 = arith.subf %29, %35 : vector<64x64xf32>
%37 = math.exp2 %36 : vector<64x64xf32>
%38 = vector.multi_reduction <add>, %37, %33 [1] : vector<64x64xf32> to vector<64xf32>
%39 = arith.truncf %37 : vector<64x64xf32> to vector<64x64xf16>
%40 = vector.broadcast %32 : vector<64xf32> to vector<64x64xf32>
%41 = vector.transpose %40, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%42 = arith.mulf %41, %arg6 : vector<64x64xf32>
%43 = vector.transfer_read %alloc_10[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%44 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%45 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%46 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %45, %46 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%48 = iree_vector_ext.to_layout %47 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %30, %38, %48 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
}
%17 = vector.broadcast %16#1 : vector<64xf32> to vector<64x64xf32>
%18 = arith.divf %cst, %17 : vector<64x64xf32>
%19 = vector.transpose %18, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%20 = arith.mulf %19, %16#2 : vector<64x64xf32>
%21 = arith.truncf %20 : vector<64x64xf32> to vector<64x64xf16>
vector.transfer_write %21, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %9[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_7[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %alloc, %subview_8 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<64x64xf16, #gpu.address_space<workgroup>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_4 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview = memref.subview %7[1, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %7[0, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) {
gpu.barrier
%10 = vector.transfer_read %subview_5[%arg0, %arg1, %arg2, %c0], %cst_4 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%12 = arith.mulf %11, %cst_0 : vector<64x64xf16>
%alloc_6 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %12, %alloc_6[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%13 = vector.transfer_read %alloc_6[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%15 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%16:3 = scf.for %arg3 = %c0 to %c4096 step %c64 iter_args(%arg4 = %cst_2, %arg5 = %cst_1, %arg6 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
gpu.barrier
%22 = vector.transfer_read %subview[%arg0, %arg1, %arg3, %c0], %cst_4 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%24 = vector.transfer_read %8[%arg0, %arg1, %c0, %arg3], %cst_4 {in_bounds = [true, true]} : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%25 = iree_vector_ext.to_layout %24 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%alloc_9 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %23, %alloc_9[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc_10 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %25, %alloc_10[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%26 = vector.transfer_read %alloc_9[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%27 = iree_vector_ext.to_layout %26 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%28 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %27, %14, %15 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%29 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%30 = vector.multi_reduction <maximumf>, %29, %arg4 [1] : vector<64x64xf32> to vector<64xf32>
%31 = arith.subf %arg4, %30 : vector<64xf32>
%32 = math.exp2 %31 : vector<64xf32>
%33 = arith.mulf %32, %arg5 : vector<64xf32>
%34 = vector.broadcast %30 : vector<64xf32> to vector<64x64xf32>
%35 = vector.transpose %34, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%36 = arith.subf %29, %35 : vector<64x64xf32>
%37 = math.exp2 %36 : vector<64x64xf32>
%38 = vector.multi_reduction <add>, %37, %33 [1] : vector<64x64xf32> to vector<64xf32>
%39 = arith.truncf %37 : vector<64x64xf32> to vector<64x64xf16>
%40 = vector.broadcast %32 : vector<64xf32> to vector<64x64xf32>
%41 = vector.transpose %40, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%42 = arith.mulf %41, %arg6 : vector<64x64xf32>
%43 = vector.transfer_read %alloc_10[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%44 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%45 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%46 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %45, %46 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%48 = iree_vector_ext.to_layout %47 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %30, %38, %48 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
}
%17 = vector.broadcast %16#1 : vector<64xf32> to vector<64x64xf32>
%18 = arith.divf %cst, %17 : vector<64x64xf32>
%19 = vector.transpose %18, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%20 = arith.mulf %19, %16#2 : vector<64x64xf32>
%21 = arith.truncf %20 : vector<64x64xf32> to vector<64x64xf16>
vector.transfer_write %21, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %9[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_7[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %alloc, %subview_8 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<64x64xf16, #gpu.address_space<workgroup>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_4 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview = memref.subview %7[1, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %7[0, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) {
gpu.barrier
%10 = vector.transfer_read %subview_5[%arg0, %arg1, %arg2, %c0], %cst_4 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%12 = arith.mulf %11, %cst_0 : vector<64x64xf16>
%alloc_6 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %12, %alloc_6[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%13 = vector.transfer_read %alloc_6[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%15 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%16:3 = scf.for %arg3 = %c0 to %c4096 step %c64 iter_args(%arg4 = %cst_2, %arg5 = %cst_1, %arg6 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
gpu.barrier
%22 = vector.transfer_read %subview[%arg0, %arg1, %arg3, %c0], %cst_4 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%24 = vector.transfer_read %8[%arg0, %arg1, %c0, %arg3], %cst_4 {in_bounds = [true, true]} : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%25 = iree_vector_ext.to_layout %24 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%alloc_9 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %23, %alloc_9[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc_10 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %25, %alloc_10[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%26 = vector.transfer_read %alloc_9[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%27 = iree_vector_ext.to_layout %26 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%28 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %27, %14, %15 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%29 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%30 = vector.multi_reduction <maximumf>, %29, %arg4 [1] : vector<64x64xf32> to vector<64xf32>
%31 = arith.subf %arg4, %30 : vector<64xf32>
%32 = math.exp2 %31 : vector<64xf32>
%33 = arith.mulf %32, %arg5 : vector<64xf32>
%34 = vector.broadcast %30 : vector<64xf32> to vector<64x64xf32>
%35 = vector.transpose %34, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%36 = arith.subf %29, %35 : vector<64x64xf32>
%37 = math.exp2 %36 : vector<64x64xf32>
%38 = vector.multi_reduction <add>, %37, %33 [1] : vector<64x64xf32> to vector<64xf32>
%39 = arith.truncf %37 : vector<64x64xf32> to vector<64x64xf16>
%40 = vector.broadcast %32 : vector<64xf32> to vector<64x64xf32>
%41 = vector.transpose %40, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%42 = arith.mulf %41, %arg6 : vector<64x64xf32>
%43 = vector.transfer_read %alloc_10[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%44 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%45 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%46 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %45, %46 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%48 = iree_vector_ext.to_layout %47 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %30, %38, %48 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
}
%17 = vector.broadcast %16#1 : vector<64xf32> to vector<64x64xf32>
%18 = arith.divf %cst, %17 : vector<64x64xf32>
%19 = vector.transpose %18, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%20 = arith.mulf %19, %16#2 : vector<64x64xf32>
%21 = arith.truncf %20 : vector<64x64xf32> to vector<64x64xf16>
vector.transfer_write %21, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %9[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_7[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %alloc, %subview_8 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<64x64xf16, #gpu.address_space<workgroup>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_4 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview = memref.subview %7[1, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %7[0, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) {
gpu.barrier
%10 = vector.transfer_read %subview_5[%arg0, %arg1, %arg2, %c0], %cst_4 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%12 = arith.mulf %11, %cst_0 : vector<64x64xf16>
%alloc_6 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %12, %alloc_6[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%13 = vector.transfer_read %alloc_6[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%15 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%16:3 = scf.for %arg3 = %c0 to %c4096 step %c64 iter_args(%arg4 = %cst_2, %arg5 = %cst_1, %arg6 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
gpu.barrier
%22 = vector.transfer_read %subview[%arg0, %arg1, %arg3, %c0], %cst_4 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%24 = vector.transfer_read %8[%arg0, %arg1, %c0, %arg3], %cst_4 {in_bounds = [true, true]} : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%25 = iree_vector_ext.to_layout %24 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%alloc_9 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %23, %alloc_9[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc_10 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %25, %alloc_10[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%26 = vector.transfer_read %alloc_9[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%27 = iree_vector_ext.to_layout %26 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%28 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %27, %14, %15 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%29 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%30 = vector.multi_reduction <maximumf>, %29, %arg4 [1] : vector<64x64xf32> to vector<64xf32>
%31 = arith.subf %arg4, %30 : vector<64xf32>
%32 = math.exp2 %31 : vector<64xf32>
%33 = arith.mulf %32, %arg5 : vector<64xf32>
%34 = vector.broadcast %30 : vector<64xf32> to vector<64x64xf32>
%35 = vector.transpose %34, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%36 = arith.subf %29, %35 : vector<64x64xf32>
%37 = math.exp2 %36 : vector<64x64xf32>
%38 = vector.multi_reduction <add>, %37, %33 [1] : vector<64x64xf32> to vector<64xf32>
%39 = arith.truncf %37 : vector<64x64xf32> to vector<64x64xf16>
%40 = vector.broadcast %32 : vector<64xf32> to vector<64x64xf32>
%41 = vector.transpose %40, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%42 = arith.mulf %41, %arg6 : vector<64x64xf32>
%43 = vector.transfer_read %alloc_10[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%44 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%45 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%46 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %45, %46 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%48 = iree_vector_ext.to_layout %47 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %30, %38, %48 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
}
%17 = vector.broadcast %16#1 : vector<64xf32> to vector<64x64xf32>
%18 = arith.divf %cst, %17 : vector<64x64xf32>
%19 = vector.transpose %18, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%20 = arith.mulf %19, %16#2 : vector<64x64xf32>
%21 = arith.truncf %20 : vector<64x64xf32> to vector<64x64xf16>
vector.transfer_write %21, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %9[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_7[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %alloc, %subview_8 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<64x64xf16, #gpu.address_space<workgroup>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
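// Note on the dump above (and the structurally identical dumps that follow): the scf.for over %c4096 in
// steps of %c64 carries (running max, running sum, accumulator) and implements an online-softmax attention
// update. Roughly, per 64x64 tile (a sketch of the recurrence, not compiler output):
//   S      = Q_scaled @ K^T                             (first vector.contract, %28)
//   m_new  = max(m_old, rowmax(S))                      (%30)
//   alpha  = exp2(m_old - m_new)                        (%32)
//   l_new  = alpha * l_old + rowsum(exp2(S - m_new))    (%38)
//   acc    = alpha * acc_old + exp2(S - m_new) @ V      (second vector.contract, %47)
// The epilogue divides the accumulator by the final row sum (%18 through %20). The query scale
// dense<1.802980e-01> appears to be 0.125 * log2(e), i.e. the 1/sqrt(64) softmax scale folded into the
// base-2 exponential used by math.exp2.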
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_4 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview = memref.subview %7[1, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %7[0, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) {
gpu.barrier
%10 = vector.transfer_read %subview_5[%arg0, %arg1, %arg2, %c0], %cst_4 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%12 = arith.mulf %11, %cst_0 : vector<64x64xf16>
%alloc_6 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %12, %alloc_6[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%13 = vector.transfer_read %alloc_6[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%15 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%16:3 = scf.for %arg3 = %c0 to %c4096 step %c64 iter_args(%arg4 = %cst_2, %arg5 = %cst_1, %arg6 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
gpu.barrier
%22 = vector.transfer_read %subview[%arg0, %arg1, %arg3, %c0], %cst_4 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%24 = vector.transfer_read %8[%arg0, %arg1, %c0, %arg3], %cst_4 {in_bounds = [true, true]} : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%25 = iree_vector_ext.to_layout %24 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%alloc_9 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %23, %alloc_9[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc_10 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %25, %alloc_10[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%26 = vector.transfer_read %alloc_9[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%27 = iree_vector_ext.to_layout %26 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%28 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %27, %14, %15 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%29 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%30 = vector.multi_reduction <maximumf>, %29, %arg4 [1] : vector<64x64xf32> to vector<64xf32>
%31 = arith.subf %arg4, %30 : vector<64xf32>
%32 = math.exp2 %31 : vector<64xf32>
%33 = arith.mulf %32, %arg5 : vector<64xf32>
%34 = vector.broadcast %30 : vector<64xf32> to vector<64x64xf32>
%35 = vector.transpose %34, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%36 = arith.subf %29, %35 : vector<64x64xf32>
%37 = math.exp2 %36 : vector<64x64xf32>
%38 = vector.multi_reduction <add>, %37, %33 [1] : vector<64x64xf32> to vector<64xf32>
%39 = arith.truncf %37 : vector<64x64xf32> to vector<64x64xf16>
%40 = vector.broadcast %32 : vector<64xf32> to vector<64x64xf32>
%41 = vector.transpose %40, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%42 = arith.mulf %41, %arg6 : vector<64x64xf32>
%43 = vector.transfer_read %alloc_10[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%44 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%45 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%46 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %45, %46 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%48 = iree_vector_ext.to_layout %47 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %30, %38, %48 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
}
%17 = vector.broadcast %16#1 : vector<64xf32> to vector<64x64xf32>
%18 = arith.divf %cst, %17 : vector<64x64xf32>
%19 = vector.transpose %18, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%20 = arith.mulf %19, %16#2 : vector<64x64xf32>
%21 = arith.truncf %20 : vector<64x64xf32> to vector<64x64xf16>
vector.transfer_write %21, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %9[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_7[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %alloc, %subview_8 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<64x64xf16, #gpu.address_space<workgroup>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_4 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview = memref.subview %7[1, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %7[0, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) {
gpu.barrier
%10 = vector.transfer_read %subview_5[%arg0, %arg1, %arg2, %c0], %cst_4 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%12 = arith.mulf %11, %cst_0 : vector<64x64xf16>
%alloc_6 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %12, %alloc_6[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%13 = vector.transfer_read %alloc_6[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%15 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%16:3 = scf.for %arg3 = %c0 to %c4096 step %c64 iter_args(%arg4 = %cst_2, %arg5 = %cst_1, %arg6 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
gpu.barrier
%22 = vector.transfer_read %subview[%arg0, %arg1, %arg3, %c0], %cst_4 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%24 = vector.transfer_read %8[%arg0, %arg1, %c0, %arg3], %cst_4 {in_bounds = [true, true]} : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%25 = iree_vector_ext.to_layout %24 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%alloc_9 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %23, %alloc_9[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc_10 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %25, %alloc_10[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%26 = vector.transfer_read %alloc_9[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%27 = iree_vector_ext.to_layout %26 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%28 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %27, %14, %15 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%29 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%30 = vector.multi_reduction <maximumf>, %29, %arg4 [1] : vector<64x64xf32> to vector<64xf32>
%31 = arith.subf %arg4, %30 : vector<64xf32>
%32 = math.exp2 %31 : vector<64xf32>
%33 = arith.mulf %32, %arg5 : vector<64xf32>
%34 = vector.broadcast %30 : vector<64xf32> to vector<64x64xf32>
%35 = vector.transpose %34, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%36 = arith.subf %29, %35 : vector<64x64xf32>
%37 = math.exp2 %36 : vector<64x64xf32>
%38 = vector.multi_reduction <add>, %37, %33 [1] : vector<64x64xf32> to vector<64xf32>
%39 = arith.truncf %37 : vector<64x64xf32> to vector<64x64xf16>
%40 = vector.broadcast %32 : vector<64xf32> to vector<64x64xf32>
%41 = vector.transpose %40, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%42 = arith.mulf %41, %arg6 : vector<64x64xf32>
%43 = vector.transfer_read %alloc_10[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%44 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%45 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%46 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %45, %46 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%48 = iree_vector_ext.to_layout %47 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %30, %38, %48 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
}
%17 = vector.broadcast %16#1 : vector<64xf32> to vector<64x64xf32>
%18 = arith.divf %cst, %17 : vector<64x64xf32>
%19 = vector.transpose %18, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%20 = arith.mulf %19, %16#2 : vector<64x64xf32>
%21 = arith.truncf %20 : vector<64x64xf32> to vector<64x64xf16>
vector.transfer_write %21, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %9[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_7[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %alloc, %subview_8 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<64x64xf16, #gpu.address_space<workgroup>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
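// The canonicalize and cse dumps above appear unchanged from the preceding IR; neither pass found anything
// to fold or deduplicate in this dispatch. The next pass hoists the statically sized workgroup allocations
// out of the scf.forall to the function entry and adds matching deallocations. Roughly (a sketch, not
// compiler output):
//   before:  scf.forall (...) {
//              %alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
//              ...
//            }
//   after:   %alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
//            scf.forall (...) { ... uses of %alloc ... }
//            memref.dealloc %alloc : memref<64x64xf16, #gpu.address_space<workgroup>>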
// -----// IR Dump After HoistStaticallyBoundAllocationsPass (iree-codegen-hoist-statically-bound-allocations) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%cst_3 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_4 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_5 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_6 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_7 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview = memref.subview %7[1, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %7[0, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) {
gpu.barrier
%10 = vector.transfer_read %subview_8[%arg0, %arg1, %arg2, %c0], %cst_7 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%12 = arith.mulf %11, %cst_3 : vector<64x64xf16>
vector.transfer_write %12, %alloc_1[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%13 = vector.transfer_read %alloc_1[%c0, %c0], %cst_7 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%15 = iree_vector_ext.to_layout %cst_6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%16:3 = scf.for %arg3 = %c0 to %c4096 step %c64 iter_args(%arg4 = %cst_5, %arg5 = %cst_4, %arg6 = %cst_6) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
gpu.barrier
%22 = vector.transfer_read %subview[%arg0, %arg1, %arg3, %c0], %cst_7 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%24 = vector.transfer_read %8[%arg0, %arg1, %c0, %arg3], %cst_7 {in_bounds = [true, true]} : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%25 = iree_vector_ext.to_layout %24 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
vector.transfer_write %23, %alloc_0[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %25, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%26 = vector.transfer_read %alloc_0[%c0, %c0], %cst_7 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%27 = iree_vector_ext.to_layout %26 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%28 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %27, %14, %15 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%29 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%30 = vector.multi_reduction <maximumf>, %29, %arg4 [1] : vector<64x64xf32> to vector<64xf32>
%31 = arith.subf %arg4, %30 : vector<64xf32>
%32 = math.exp2 %31 : vector<64xf32>
%33 = arith.mulf %32, %arg5 : vector<64xf32>
%34 = vector.broadcast %30 : vector<64xf32> to vector<64x64xf32>
%35 = vector.transpose %34, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%36 = arith.subf %29, %35 : vector<64x64xf32>
%37 = math.exp2 %36 : vector<64x64xf32>
%38 = vector.multi_reduction <add>, %37, %33 [1] : vector<64x64xf32> to vector<64xf32>
%39 = arith.truncf %37 : vector<64x64xf32> to vector<64x64xf16>
%40 = vector.broadcast %32 : vector<64xf32> to vector<64x64xf32>
%41 = vector.transpose %40, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%42 = arith.mulf %41, %arg6 : vector<64x64xf32>
%43 = vector.transfer_read %alloc[%c0, %c0], %cst_7 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%44 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%45 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%46 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %45, %46 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%48 = iree_vector_ext.to_layout %47 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %30, %38, %48 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
}
%17 = vector.broadcast %16#1 : vector<64xf32> to vector<64x64xf32>
%18 = arith.divf %cst, %17 : vector<64x64xf32>
%19 = vector.transpose %18, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%20 = arith.mulf %19, %16#2 : vector<64x64xf32>
%21 = arith.truncf %20 : vector<64x64xf32> to vector<64x64xf16>
vector.transfer_write %21, %alloc_2[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%subview_9 = memref.subview %9[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_10 = memref.subview %subview_9[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %alloc_2, %subview_10 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<64x64xf16, #gpu.address_space<workgroup>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.dealloc %alloc_2 : memref<64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc_1 : memref<64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc_0 : memref<64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<64x64xf16, #gpu.address_space<workgroup>>
return
}
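// The cast-type-to-fit-mma pass below only tags the two vector.contract ops with
// {iree.amdgpu.mma = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>}; the operand and accumulator element
// types (f16 inputs, f32 accumulator) already fit that MFMA intrinsic, so no additional arith.extf or
// arith.truncf casts are inserted here.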
// -----// IR Dump After LLVMGPUCastTypeToFitMMAPass (iree-llvmgpu-cast-type-to-fit-mma) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%c0 = arith.constant 0 : index
%c4096 = arith.constant 4096 : index
%c64 = arith.constant 64 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_1 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
%cst_2 = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_3 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
%cst_4 = arith.constant dense<1.000000e+00> : vector<64x64xf32>
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc_5 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc_6 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc_7 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview = memref.subview %7[1, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %7[0, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) {
gpu.barrier
%10 = vector.transfer_read %subview_8[%arg0, %arg1, %arg2, %c0], %cst {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%12 = arith.mulf %11, %cst_3 : vector<64x64xf16>
vector.transfer_write %12, %alloc_6[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%13 = vector.transfer_read %alloc_6[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%15 = iree_vector_ext.to_layout %cst_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%16:3 = scf.for %arg3 = %c0 to %c4096 step %c64 iter_args(%arg4 = %cst_1, %arg5 = %cst_2, %arg6 = %cst_0) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
gpu.barrier
%22 = vector.transfer_read %subview[%arg0, %arg1, %arg3, %c0], %cst {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
%24 = vector.transfer_read %8[%arg0, %arg1, %c0, %arg3], %cst {in_bounds = [true, true]} : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16>
%25 = iree_vector_ext.to_layout %24 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16>
vector.transfer_write %23, %alloc_5[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %25, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%26 = vector.transfer_read %alloc_5[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%27 = iree_vector_ext.to_layout %26 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%28 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %27, %14, %15 {iree.amdgpu.mma = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%29 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%30 = vector.multi_reduction <maximumf>, %29, %arg4 [1] : vector<64x64xf32> to vector<64xf32>
%31 = arith.subf %arg4, %30 : vector<64xf32>
%32 = math.exp2 %31 : vector<64xf32>
%33 = arith.mulf %32, %arg5 : vector<64xf32>
%34 = vector.broadcast %30 : vector<64xf32> to vector<64x64xf32>
%35 = vector.transpose %34, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%36 = arith.subf %29, %35 : vector<64x64xf32>
%37 = math.exp2 %36 : vector<64x64xf32>
%38 = vector.multi_reduction <add>, %37, %33 [1] : vector<64x64xf32> to vector<64xf32>
%39 = arith.truncf %37 : vector<64x64xf32> to vector<64x64xf16>
%40 = vector.broadcast %32 : vector<64xf32> to vector<64x64xf32>
%41 = vector.transpose %40, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%42 = arith.mulf %41, %arg6 : vector<64x64xf32>
%43 = vector.transfer_read %alloc[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
%44 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%45 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
%46 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %45, %46 {iree.amdgpu.mma = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
%48 = iree_vector_ext.to_layout %47 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %30, %38, %48 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
}
%17 = vector.broadcast %16#1 : vector<64xf32> to vector<64x64xf32>
%18 = arith.divf %cst_4, %17 : vector<64x64xf32>
%19 = vector.transpose %18, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
%20 = arith.mulf %19, %16#2 : vector<64x64xf32>
%21 = arith.truncf %20 : vector<64x64xf32> to vector<64x64xf16>
vector.transfer_write %21, %alloc_7[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%subview_9 = memref.subview %9[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_10 = memref.subview %subview_9[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %alloc_7, %subview_10 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<64x64xf16, #gpu.address_space<workgroup>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.dealloc %alloc_7 : memref<64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc_6 : memref<64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc_5 : memref<64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<64x64xf16, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After LLVMGPUVectorDistributePass (iree-llvmgpu-vector-distribute) //----- //
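// annotation (reader note, not compiler output): this dump shows the same attention
// dispatch after vector distribution. The 64x64 workgroup-level vectors of the previous
// dump have been distributed across the 128-thread workgroup (two subgroups of 64)
// according to the nested layouts, the vector.contract ops have been unrolled into
// amdgpu.mfma 16x16x16 f16 intrinsics, and the row-wise multi_reductions are now
// finished with gpu.subgroup_reduce across lanes.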
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<0.000000e+00> : vector<4x2x1x1x4x1xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<2xf32>
%cst_1 = arith.constant dense<0xFF800000> : vector<2x1x1xf32>
%cst_2 = arith.constant dense<0.000000e+00> : vector<4x4x1x1x1x4xf16>
%cst_3 = arith.constant dense<0.000000e+00> : vector<2x4x1x1x1x4xf16>
%cst_4 = arith.constant dense<0.000000e+00> : vector<4x1x1x1x1x8xf16>
%cst_5 = arith.constant dense<1.000000e+00> : vector<4x2x1x1x4x1xf32>
%cst_6 = arith.constant dense<1.802980e-01> : vector<4x1x1x1x1x8xf16>
%cst_7 = arith.constant dense<0.000000e+00> : vector<2x1x1xf32>
%cst_8 = arith.constant dense<-3.40282347E+38> : vector<2x1x1xf32>
%cst_9 = arith.constant dense<0.000000e+00> : vector<2x4x1x1x1x4xf32>
%cst_10 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%thread_id_z = gpu.thread_id z
%thread_id_y = gpu.thread_id y
%thread_id_x = gpu.thread_id x
%0 = affine.linearize_index disjoint [%thread_id_z, %thread_id_y, %thread_id_x] by (1, 1, 128) : index
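// annotation: %0 is the linearized thread id in [0, 128); lanes 0-63 form the first
// subgroup and 64-127 the second, which is what the "floordiv 64" terms in the affine
// maps below distinguish.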
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc_11 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc_12 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc_13 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%3 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6 = arith.index_castui %3 : i32 to index
%7:3 = util.assume.int
%4[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%5[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%6[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%7#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %8, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%7#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %9, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%10 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%7#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %10, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview = memref.subview %8[1, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_14 = memref.subview %8[0, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) {
gpu.barrier
%11 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 16) * 16)>()[%arg2, %0]
%12 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
%13 = vector.transfer_read %subview_14[%arg0, %arg1, %11, %12], %cst_10 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%14 = vector.insert_strided_slice %13, %cst_4 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<4x1x1x1x1x8xf16>
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 16) * 16 + 16)>()[%arg2, %0]
%16 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
%17 = vector.transfer_read %subview_14[%arg0, %arg1, %15, %16], %cst_10 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%18 = vector.insert_strided_slice %17, %14 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<4x1x1x1x1x8xf16>
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 16) * 16 + 32)>()[%arg2, %0]
%20 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
%21 = vector.transfer_read %subview_14[%arg0, %arg1, %19, %20], %cst_10 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%22 = vector.insert_strided_slice %21, %18 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<4x1x1x1x1x8xf16>
%23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 16) * 16 + 48)>()[%arg2, %0]
%24 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
%25 = vector.transfer_read %subview_14[%arg0, %arg1, %23, %24], %cst_10 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%26 = vector.insert_strided_slice %25, %22 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<4x1x1x1x1x8xf16>
%27 = arith.mulf %26, %cst_6 : vector<4x1x1x1x1x8xf16>
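// annotation: %cst_6 = 1.802980e-01 appears to be 0.125 * log2(e) rounded to f16,
// i.e. the 1/sqrt(64) attention scale fused with the natural-log-to-base-2 conversion,
// since the softmax in the loop below is evaluated with math.exp2 rather than exp.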
%28 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16)>()[%0]
%29 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
%30 = vector.extract %27[0, 0, 0, 0] : vector<1x8xf16> from vector<4x1x1x1x1x8xf16>
vector.transfer_write %30, %alloc_12[%28, %29] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%31 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 16)>()[%0]
%32 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
%33 = vector.extract %27[1, 0, 0, 0] : vector<1x8xf16> from vector<4x1x1x1x1x8xf16>
vector.transfer_write %33, %alloc_12[%31, %32] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%34 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 32)>()[%0]
%35 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
%36 = vector.extract %27[2, 0, 0, 0] : vector<1x8xf16> from vector<4x1x1x1x1x8xf16>
vector.transfer_write %36, %alloc_12[%34, %35] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%37 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 48)>()[%0]
%38 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
%39 = vector.extract %27[3, 0, 0, 0] : vector<1x8xf16> from vector<4x1x1x1x1x8xf16>
vector.transfer_write %39, %alloc_12[%37, %38] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%40 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%41 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0]
%42 = vector.transfer_read %alloc_12[%40, %41], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%43 = vector.insert_strided_slice %42, %cst_3 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x4x1x1x1x4xf16>
%44 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%45 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0]
%46 = vector.transfer_read %alloc_12[%44, %45], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%47 = vector.insert_strided_slice %46, %43 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x4x1x1x1x4xf16>
%48 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%49 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0]
%50 = vector.transfer_read %alloc_12[%48, %49], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%51 = vector.insert_strided_slice %50, %47 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x4x1x1x1x4xf16>
%52 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%53 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0]
%54 = vector.transfer_read %alloc_12[%52, %53], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%55 = vector.insert_strided_slice %54, %51 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x4x1x1x1x4xf16>
%56 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%57 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0]
%58 = vector.transfer_read %alloc_12[%56, %57], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%59 = vector.insert_strided_slice %58, %55 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x4x1x1x1x4xf16>
%60 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%61 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0]
%62 = vector.transfer_read %alloc_12[%60, %61], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%63 = vector.insert_strided_slice %62, %59 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x4x1x1x1x4xf16>
%64 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%65 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0]
%66 = vector.transfer_read %alloc_12[%64, %65], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%67 = vector.insert_strided_slice %66, %63 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x4x1x1x1x4xf16>
%68 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%69 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0]
%70 = vector.transfer_read %alloc_12[%68, %69], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%71 = vector.insert_strided_slice %70, %67 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x4x1x1x1x4xf16>
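// annotation: %71 is this thread's resident fragment of the pre-scaled query tile
// (2x4 blocks of 1x4 f16 elements). The scf.for below is the online-softmax sweep over
// the 4096 key/value positions in 64-wide steps, carrying the running row max (%arg4),
// running row sum (%arg5) and the unnormalized accumulator (%arg6) as iter_args.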
%72:3 = scf.for %arg3 = %c0 to %c4096 step %c64 iter_args(%arg4 = %cst_8, %arg5 = %cst_7, %arg6 = %cst_9) -> (vector<2x1x1xf32>, vector<2x1x1xf32>, vector<2x4x1x1x1x4xf32>) {
gpu.barrier
%125 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 16) * 16)>()[%arg3, %0]
%126 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
%127 = vector.transfer_read %subview[%arg0, %arg1, %125, %126], %cst_10 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%128 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 16) * 16 + 16)>()[%arg3, %0]
%129 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
%130 = vector.transfer_read %subview[%arg0, %arg1, %128, %129], %cst_10 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%131 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 16) * 16 + 32)>()[%arg3, %0]
%132 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
%133 = vector.transfer_read %subview[%arg0, %arg1, %131, %132], %cst_10 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%134 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 16) * 16 + 48)>()[%arg3, %0]
%135 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
%136 = vector.transfer_read %subview[%arg0, %arg1, %134, %135], %cst_10 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%137 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16)>()[%0]
%138 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg3, %0]
%139 = vector.transfer_read %9[%arg0, %arg1, %137, %138], %cst_10 {in_bounds = [true, true]} : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%140 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 16)>()[%0]
%141 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg3, %0]
%142 = vector.transfer_read %9[%arg0, %arg1, %140, %141], %cst_10 {in_bounds = [true, true]} : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%143 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 32)>()[%0]
%144 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg3, %0]
%145 = vector.transfer_read %9[%arg0, %arg1, %143, %144], %cst_10 {in_bounds = [true, true]} : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%146 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 48)>()[%0]
%147 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg3, %0]
%148 = vector.transfer_read %9[%arg0, %arg1, %146, %147], %cst_10 {in_bounds = [true, true]} : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%149 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16)>()[%0]
%150 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
vector.transfer_write %127, %alloc_11[%149, %150] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%151 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 16)>()[%0]
%152 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
vector.transfer_write %130, %alloc_11[%151, %152] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%153 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 32)>()[%0]
%154 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
vector.transfer_write %133, %alloc_11[%153, %154] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%155 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 48)>()[%0]
%156 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
vector.transfer_write %136, %alloc_11[%155, %156] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%157 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16)>()[%0]
%158 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
vector.transfer_write %139, %alloc[%157, %158] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%159 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 16)>()[%0]
%160 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
vector.transfer_write %142, %alloc[%159, %160] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%161 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 32)>()[%0]
%162 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
vector.transfer_write %145, %alloc[%161, %162] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%163 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 48)>()[%0]
%164 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
vector.transfer_write %148, %alloc[%163, %164] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
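// annotation: the two operand tiles for this 64-wide step have just been staged into
// workgroup memory (%alloc_11 and %alloc) with 1x8 stores per thread; below they are
// re-read as 1x4 fragments in the 16x4-thread MFMA operand layout, and the first
// 64x64 contraction of the step (the score tile, i.e. the pre-scaled resident fragment
// %71 against the freshly staged block) is expanded into amdgpu.mfma ops.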
%165 = affine.apply affine_map<()[s0] -> (s0 mod 16)>()[%0]
%166 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0]
%167 = vector.transfer_read %alloc_11[%165, %166], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%168 = vector.insert_strided_slice %167, %cst_2 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%169 = affine.apply affine_map<()[s0] -> (s0 mod 16)>()[%0]
%170 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0]
%171 = vector.transfer_read %alloc_11[%169, %170], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%172 = vector.insert_strided_slice %171, %168 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%173 = affine.apply affine_map<()[s0] -> (s0 mod 16)>()[%0]
%174 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0]
%175 = vector.transfer_read %alloc_11[%173, %174], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%176 = vector.insert_strided_slice %175, %172 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%177 = affine.apply affine_map<()[s0] -> (s0 mod 16)>()[%0]
%178 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0]
%179 = vector.transfer_read %alloc_11[%177, %178], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%180 = vector.insert_strided_slice %179, %176 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%181 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 16)>()[%0]
%182 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0]
%183 = vector.transfer_read %alloc_11[%181, %182], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%184 = vector.insert_strided_slice %183, %180 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%185 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 16)>()[%0]
%186 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0]
%187 = vector.transfer_read %alloc_11[%185, %186], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%188 = vector.insert_strided_slice %187, %184 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%189 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 16)>()[%0]
%190 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0]
%191 = vector.transfer_read %alloc_11[%189, %190], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%192 = vector.insert_strided_slice %191, %188 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%193 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 16)>()[%0]
%194 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0]
%195 = vector.transfer_read %alloc_11[%193, %194], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%196 = vector.insert_strided_slice %195, %192 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%197 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 32)>()[%0]
%198 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0]
%199 = vector.transfer_read %alloc_11[%197, %198], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%200 = vector.insert_strided_slice %199, %196 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%201 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 32)>()[%0]
%202 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0]
%203 = vector.transfer_read %alloc_11[%201, %202], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%204 = vector.insert_strided_slice %203, %200 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%205 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 32)>()[%0]
%206 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0]
%207 = vector.transfer_read %alloc_11[%205, %206], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%208 = vector.insert_strided_slice %207, %204 {offsets = [2, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%209 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 32)>()[%0]
%210 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0]
%211 = vector.transfer_read %alloc_11[%209, %210], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%212 = vector.insert_strided_slice %211, %208 {offsets = [2, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%213 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 48)>()[%0]
%214 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0]
%215 = vector.transfer_read %alloc_11[%213, %214], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%216 = vector.insert_strided_slice %215, %212 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%217 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 48)>()[%0]
%218 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0]
%219 = vector.transfer_read %alloc_11[%217, %218], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%220 = vector.insert_strided_slice %219, %216 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%221 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 48)>()[%0]
%222 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0]
%223 = vector.transfer_read %alloc_11[%221, %222], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%224 = vector.insert_strided_slice %223, %220 {offsets = [3, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%225 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 48)>()[%0]
%226 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0]
%227 = vector.transfer_read %alloc_11[%225, %226], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%228 = vector.insert_strided_slice %227, %224 {offsets = [3, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
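// annotation: %228 holds a 4x4 grid of 1x4 operand fragments read back from %alloc_11;
// each of the 2x4 accumulator tiles below chains four MFMA_F32_16x16x16_F16 ops along
// the shared k dimension, so the full 64x64 score tile of this iteration takes 32 mfma
// ops in this unrolled chain.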
%229 = vector.extract %cst_9[0, 0] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32>
%230 = vector.extract %228[0, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%231 = vector.extract %71[0, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%232 = vector.shape_cast %230 : vector<1x1x1x4xf16> to vector<4xf16>
%233 = vector.shape_cast %231 : vector<1x1x1x4xf16> to vector<4xf16>
%234 = vector.shape_cast %229 : vector<1x1x1x4xf32> to vector<4xf32>
%235 = amdgpu.mfma %232 * %233 + %234 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%236 = vector.extract %228[0, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%237 = vector.extract %71[0, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%238 = vector.shape_cast %236 : vector<1x1x1x4xf16> to vector<4xf16>
%239 = vector.shape_cast %237 : vector<1x1x1x4xf16> to vector<4xf16>
%240 = amdgpu.mfma %238 * %239 + %235 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%241 = vector.extract %228[0, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%242 = vector.extract %71[0, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%243 = vector.shape_cast %241 : vector<1x1x1x4xf16> to vector<4xf16>
%244 = vector.shape_cast %242 : vector<1x1x1x4xf16> to vector<4xf16>
%245 = amdgpu.mfma %243 * %244 + %240 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%246 = vector.extract %228[0, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%247 = vector.extract %71[0, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%248 = vector.shape_cast %246 : vector<1x1x1x4xf16> to vector<4xf16>
%249 = vector.shape_cast %247 : vector<1x1x1x4xf16> to vector<4xf16>
%250 = amdgpu.mfma %248 * %249 + %245 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%251 = vector.shape_cast %250 : vector<4xf32> to vector<1x1x1x4xf32>
%252 = vector.insert %251, %cst_9 [0, 0] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32>
%253 = vector.extract %cst_9[0, 1] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32>
%254 = vector.extract %228[1, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%255 = vector.extract %71[0, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%256 = vector.shape_cast %254 : vector<1x1x1x4xf16> to vector<4xf16>
%257 = vector.shape_cast %255 : vector<1x1x1x4xf16> to vector<4xf16>
%258 = vector.shape_cast %253 : vector<1x1x1x4xf32> to vector<4xf32>
%259 = amdgpu.mfma %256 * %257 + %258 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%260 = vector.extract %228[1, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%261 = vector.extract %71[0, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%262 = vector.shape_cast %260 : vector<1x1x1x4xf16> to vector<4xf16>
%263 = vector.shape_cast %261 : vector<1x1x1x4xf16> to vector<4xf16>
%264 = amdgpu.mfma %262 * %263 + %259 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%265 = vector.extract %228[1, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%266 = vector.extract %71[0, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%267 = vector.shape_cast %265 : vector<1x1x1x4xf16> to vector<4xf16>
%268 = vector.shape_cast %266 : vector<1x1x1x4xf16> to vector<4xf16>
%269 = amdgpu.mfma %267 * %268 + %264 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%270 = vector.extract %228[1, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%271 = vector.extract %71[0, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%272 = vector.shape_cast %270 : vector<1x1x1x4xf16> to vector<4xf16>
%273 = vector.shape_cast %271 : vector<1x1x1x4xf16> to vector<4xf16>
%274 = amdgpu.mfma %272 * %273 + %269 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%275 = vector.shape_cast %274 : vector<4xf32> to vector<1x1x1x4xf32>
%276 = vector.insert %275, %252 [0, 1] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32>
%277 = vector.extract %cst_9[0, 2] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32>
%278 = vector.extract %228[2, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%279 = vector.extract %71[0, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%280 = vector.shape_cast %278 : vector<1x1x1x4xf16> to vector<4xf16>
%281 = vector.shape_cast %279 : vector<1x1x1x4xf16> to vector<4xf16>
%282 = vector.shape_cast %277 : vector<1x1x1x4xf32> to vector<4xf32>
%283 = amdgpu.mfma %280 * %281 + %282 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%284 = vector.extract %228[2, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%285 = vector.extract %71[0, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%286 = vector.shape_cast %284 : vector<1x1x1x4xf16> to vector<4xf16>
%287 = vector.shape_cast %285 : vector<1x1x1x4xf16> to vector<4xf16>
%288 = amdgpu.mfma %286 * %287 + %283 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%289 = vector.extract %228[2, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%290 = vector.extract %71[0, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%291 = vector.shape_cast %289 : vector<1x1x1x4xf16> to vector<4xf16>
%292 = vector.shape_cast %290 : vector<1x1x1x4xf16> to vector<4xf16>
%293 = amdgpu.mfma %291 * %292 + %288 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%294 = vector.extract %228[2, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%295 = vector.extract %71[0, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%296 = vector.shape_cast %294 : vector<1x1x1x4xf16> to vector<4xf16>
%297 = vector.shape_cast %295 : vector<1x1x1x4xf16> to vector<4xf16>
%298 = amdgpu.mfma %296 * %297 + %293 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%299 = vector.shape_cast %298 : vector<4xf32> to vector<1x1x1x4xf32>
%300 = vector.insert %299, %276 [0, 2] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32>
%301 = vector.extract %cst_9[0, 3] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32>
%302 = vector.extract %228[3, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%303 = vector.extract %71[0, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%304 = vector.shape_cast %302 : vector<1x1x1x4xf16> to vector<4xf16>
%305 = vector.shape_cast %303 : vector<1x1x1x4xf16> to vector<4xf16>
%306 = vector.shape_cast %301 : vector<1x1x1x4xf32> to vector<4xf32>
%307 = amdgpu.mfma %304 * %305 + %306 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%308 = vector.extract %228[3, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%309 = vector.extract %71[0, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%310 = vector.shape_cast %308 : vector<1x1x1x4xf16> to vector<4xf16>
%311 = vector.shape_cast %309 : vector<1x1x1x4xf16> to vector<4xf16>
%312 = amdgpu.mfma %310 * %311 + %307 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%313 = vector.extract %228[3, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%314 = vector.extract %71[0, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%315 = vector.shape_cast %313 : vector<1x1x1x4xf16> to vector<4xf16>
%316 = vector.shape_cast %314 : vector<1x1x1x4xf16> to vector<4xf16>
%317 = amdgpu.mfma %315 * %316 + %312 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%318 = vector.extract %228[3, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%319 = vector.extract %71[0, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%320 = vector.shape_cast %318 : vector<1x1x1x4xf16> to vector<4xf16>
%321 = vector.shape_cast %319 : vector<1x1x1x4xf16> to vector<4xf16>
%322 = amdgpu.mfma %320 * %321 + %317 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%323 = vector.shape_cast %322 : vector<4xf32> to vector<1x1x1x4xf32>
%324 = vector.insert %323, %300 [0, 3] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32>
%325 = vector.extract %cst_9[1, 0] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32>
%326 = vector.extract %228[0, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%327 = vector.extract %71[1, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%328 = vector.shape_cast %326 : vector<1x1x1x4xf16> to vector<4xf16>
%329 = vector.shape_cast %327 : vector<1x1x1x4xf16> to vector<4xf16>
%330 = vector.shape_cast %325 : vector<1x1x1x4xf32> to vector<4xf32>
%331 = amdgpu.mfma %328 * %329 + %330 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%332 = vector.extract %228[0, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%333 = vector.extract %71[1, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%334 = vector.shape_cast %332 : vector<1x1x1x4xf16> to vector<4xf16>
%335 = vector.shape_cast %333 : vector<1x1x1x4xf16> to vector<4xf16>
%336 = amdgpu.mfma %334 * %335 + %331 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%337 = vector.extract %228[0, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%338 = vector.extract %71[1, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%339 = vector.shape_cast %337 : vector<1x1x1x4xf16> to vector<4xf16>
%340 = vector.shape_cast %338 : vector<1x1x1x4xf16> to vector<4xf16>
%341 = amdgpu.mfma %339 * %340 + %336 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%342 = vector.extract %228[0, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%343 = vector.extract %71[1, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%344 = vector.shape_cast %342 : vector<1x1x1x4xf16> to vector<4xf16>
%345 = vector.shape_cast %343 : vector<1x1x1x4xf16> to vector<4xf16>
%346 = amdgpu.mfma %344 * %345 + %341 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%347 = vector.shape_cast %346 : vector<4xf32> to vector<1x1x1x4xf32>
%348 = vector.insert %347, %324 [1, 0] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32>
%349 = vector.extract %cst_9[1, 1] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32>
%350 = vector.extract %228[1, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%351 = vector.extract %71[1, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%352 = vector.shape_cast %350 : vector<1x1x1x4xf16> to vector<4xf16>
%353 = vector.shape_cast %351 : vector<1x1x1x4xf16> to vector<4xf16>
%354 = vector.shape_cast %349 : vector<1x1x1x4xf32> to vector<4xf32>
%355 = amdgpu.mfma %352 * %353 + %354 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%356 = vector.extract %228[1, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%357 = vector.extract %71[1, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%358 = vector.shape_cast %356 : vector<1x1x1x4xf16> to vector<4xf16>
%359 = vector.shape_cast %357 : vector<1x1x1x4xf16> to vector<4xf16>
%360 = amdgpu.mfma %358 * %359 + %355 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%361 = vector.extract %228[1, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%362 = vector.extract %71[1, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%363 = vector.shape_cast %361 : vector<1x1x1x4xf16> to vector<4xf16>
%364 = vector.shape_cast %362 : vector<1x1x1x4xf16> to vector<4xf16>
%365 = amdgpu.mfma %363 * %364 + %360 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%366 = vector.extract %228[1, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%367 = vector.extract %71[1, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%368 = vector.shape_cast %366 : vector<1x1x1x4xf16> to vector<4xf16>
%369 = vector.shape_cast %367 : vector<1x1x1x4xf16> to vector<4xf16>
%370 = amdgpu.mfma %368 * %369 + %365 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%371 = vector.shape_cast %370 : vector<4xf32> to vector<1x1x1x4xf32>
%372 = vector.insert %371, %348 [1, 1] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32>
%373 = vector.extract %cst_9[1, 2] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32>
%374 = vector.extract %228[2, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%375 = vector.extract %71[1, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%376 = vector.shape_cast %374 : vector<1x1x1x4xf16> to vector<4xf16>
%377 = vector.shape_cast %375 : vector<1x1x1x4xf16> to vector<4xf16>
%378 = vector.shape_cast %373 : vector<1x1x1x4xf32> to vector<4xf32>
%379 = amdgpu.mfma %376 * %377 + %378 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%380 = vector.extract %228[2, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%381 = vector.extract %71[1, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%382 = vector.shape_cast %380 : vector<1x1x1x4xf16> to vector<4xf16>
%383 = vector.shape_cast %381 : vector<1x1x1x4xf16> to vector<4xf16>
%384 = amdgpu.mfma %382 * %383 + %379 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%385 = vector.extract %228[2, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%386 = vector.extract %71[1, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%387 = vector.shape_cast %385 : vector<1x1x1x4xf16> to vector<4xf16>
%388 = vector.shape_cast %386 : vector<1x1x1x4xf16> to vector<4xf16>
%389 = amdgpu.mfma %387 * %388 + %384 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%390 = vector.extract %228[2, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%391 = vector.extract %71[1, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%392 = vector.shape_cast %390 : vector<1x1x1x4xf16> to vector<4xf16>
%393 = vector.shape_cast %391 : vector<1x1x1x4xf16> to vector<4xf16>
%394 = amdgpu.mfma %392 * %393 + %389 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%395 = vector.shape_cast %394 : vector<4xf32> to vector<1x1x1x4xf32>
%396 = vector.insert %395, %372 [1, 2] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32>
%397 = vector.extract %cst_9[1, 3] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32>
%398 = vector.extract %228[3, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%399 = vector.extract %71[1, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%400 = vector.shape_cast %398 : vector<1x1x1x4xf16> to vector<4xf16>
%401 = vector.shape_cast %399 : vector<1x1x1x4xf16> to vector<4xf16>
%402 = vector.shape_cast %397 : vector<1x1x1x4xf32> to vector<4xf32>
%403 = amdgpu.mfma %400 * %401 + %402 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%404 = vector.extract %228[3, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%405 = vector.extract %71[1, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%406 = vector.shape_cast %404 : vector<1x1x1x4xf16> to vector<4xf16>
%407 = vector.shape_cast %405 : vector<1x1x1x4xf16> to vector<4xf16>
%408 = amdgpu.mfma %406 * %407 + %403 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%409 = vector.extract %228[3, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%410 = vector.extract %71[1, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%411 = vector.shape_cast %409 : vector<1x1x1x4xf16> to vector<4xf16>
%412 = vector.shape_cast %410 : vector<1x1x1x4xf16> to vector<4xf16>
%413 = amdgpu.mfma %411 * %412 + %408 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%414 = vector.extract %228[3, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%415 = vector.extract %71[1, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%416 = vector.shape_cast %414 : vector<1x1x1x4xf16> to vector<4xf16>
%417 = vector.shape_cast %415 : vector<1x1x1x4xf16> to vector<4xf16>
%418 = amdgpu.mfma %416 * %417 + %413 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%419 = vector.shape_cast %418 : vector<4xf32> to vector<1x1x1x4xf32>
%420 = vector.insert %419, %396 [1, 3] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32>
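// annotation: %420 is the per-thread fragment of this step's score tile. The row-wise
// max below is computed in two stages: a multi_reduction over the thread-local elements,
// then gpu.subgroup_reduce with cluster(size = 4, stride = 16), i.e. across the four
// lanes that hold slices of the same rows, followed by the usual online-softmax
// rescaling of the previous max and sum.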
%421 = vector.multi_reduction <maximumf>, %420, %cst_1 [1, 3, 5] : vector<2x4x1x1x1x4xf32> to vector<2x1x1xf32>
%422 = vector.extract %421[0, 0, 0] : f32 from vector<2x1x1xf32>
%423 = gpu.subgroup_reduce maximumf %422 cluster(size = 4, stride = 16) : (f32) -> f32
%424 = vector.insert %423, %cst_0 [0] : f32 into vector<2xf32>
%425 = vector.extract %421[1, 0, 0] : f32 from vector<2x1x1xf32>
%426 = gpu.subgroup_reduce maximumf %425 cluster(size = 4, stride = 16) : (f32) -> f32
%427 = vector.insert %426, %424 [1] : f32 into vector<2xf32>
%428 = vector.shape_cast %427 : vector<2xf32> to vector<2x1x1xf32>
%429 = arith.maximumf %428, %arg4 : vector<2x1x1xf32>
%430 = arith.subf %arg4, %429 : vector<2x1x1xf32>
%431 = math.exp2 %430 : vector<2x1x1xf32>
%432 = arith.mulf %431, %arg5 : vector<2x1x1xf32>
%433 = vector.extract %429[0, 0] : vector<1xf32> from vector<2x1x1xf32>
%434 = vector.broadcast %433 : vector<1xf32> to vector<4x1xf32>
%435 = vector.insert %434, %cst [0, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%436 = vector.extract %429[1, 0] : vector<1xf32> from vector<2x1x1xf32>
%437 = vector.broadcast %436 : vector<1xf32> to vector<4x1xf32>
%438 = vector.insert %437, %435 [0, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%439 = vector.extract %429[0, 0] : vector<1xf32> from vector<2x1x1xf32>
%440 = vector.broadcast %439 : vector<1xf32> to vector<4x1xf32>
%441 = vector.insert %440, %438 [1, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%442 = vector.extract %429[1, 0] : vector<1xf32> from vector<2x1x1xf32>
%443 = vector.broadcast %442 : vector<1xf32> to vector<4x1xf32>
%444 = vector.insert %443, %441 [1, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%445 = vector.extract %429[0, 0] : vector<1xf32> from vector<2x1x1xf32>
%446 = vector.broadcast %445 : vector<1xf32> to vector<4x1xf32>
%447 = vector.insert %446, %444 [2, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%448 = vector.extract %429[1, 0] : vector<1xf32> from vector<2x1x1xf32>
%449 = vector.broadcast %448 : vector<1xf32> to vector<4x1xf32>
%450 = vector.insert %449, %447 [2, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%451 = vector.extract %429[0, 0] : vector<1xf32> from vector<2x1x1xf32>
%452 = vector.broadcast %451 : vector<1xf32> to vector<4x1xf32>
%453 = vector.insert %452, %450 [3, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%454 = vector.extract %429[1, 0] : vector<1xf32> from vector<2x1x1xf32>
%455 = vector.broadcast %454 : vector<1xf32> to vector<4x1xf32>
%456 = vector.insert %455, %453 [3, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%457 = vector.transpose %456, [1, 0, 3, 2, 5, 4] : vector<4x2x1x1x4x1xf32> to vector<2x4x1x1x1x4xf32>
%458 = arith.subf %420, %457 : vector<2x4x1x1x1x4xf32>
%459 = math.exp2 %458 : vector<2x4x1x1x1x4xf32>
%460 = vector.multi_reduction <add>, %459, %cst_7 [1, 3, 5] : vector<2x4x1x1x1x4xf32> to vector<2x1x1xf32>
%461 = vector.extract %460[0, 0, 0] : f32 from vector<2x1x1xf32>
%462 = gpu.subgroup_reduce add %461 cluster(size = 4, stride = 16) : (f32) -> f32
%463 = vector.insert %462, %cst_0 [0] : f32 into vector<2xf32>
%464 = vector.extract %460[1, 0, 0] : f32 from vector<2x1x1xf32>
%465 = gpu.subgroup_reduce add %464 cluster(size = 4, stride = 16) : (f32) -> f32
%466 = vector.insert %465, %463 [1] : f32 into vector<2xf32>
%467 = vector.shape_cast %466 : vector<2xf32> to vector<2x1x1xf32>
%468 = arith.addf %467, %432 : vector<2x1x1xf32>
%469 = arith.truncf %459 : vector<2x4x1x1x1x4xf32> to vector<2x4x1x1x1x4xf16>
%470 = vector.extract %431[0, 0] : vector<1xf32> from vector<2x1x1xf32>
%471 = vector.broadcast %470 : vector<1xf32> to vector<4x1xf32>
%472 = vector.insert %471, %cst [0, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%473 = vector.extract %431[1, 0] : vector<1xf32> from vector<2x1x1xf32>
%474 = vector.broadcast %473 : vector<1xf32> to vector<4x1xf32>
%475 = vector.insert %474, %472 [0, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%476 = vector.extract %431[0, 0] : vector<1xf32> from vector<2x1x1xf32>
%477 = vector.broadcast %476 : vector<1xf32> to vector<4x1xf32>
%478 = vector.insert %477, %475 [1, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%479 = vector.extract %431[1, 0] : vector<1xf32> from vector<2x1x1xf32>
%480 = vector.broadcast %479 : vector<1xf32> to vector<4x1xf32>
%481 = vector.insert %480, %478 [1, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%482 = vector.extract %431[0, 0] : vector<1xf32> from vector<2x1x1xf32>
%483 = vector.broadcast %482 : vector<1xf32> to vector<4x1xf32>
%484 = vector.insert %483, %481 [2, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%485 = vector.extract %431[1, 0] : vector<1xf32> from vector<2x1x1xf32>
%486 = vector.broadcast %485 : vector<1xf32> to vector<4x1xf32>
%487 = vector.insert %486, %484 [2, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%488 = vector.extract %431[0, 0] : vector<1xf32> from vector<2x1x1xf32>
%489 = vector.broadcast %488 : vector<1xf32> to vector<4x1xf32>
%490 = vector.insert %489, %487 [3, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%491 = vector.extract %431[1, 0] : vector<1xf32> from vector<2x1x1xf32>
%492 = vector.broadcast %491 : vector<1xf32> to vector<4x1xf32>
%493 = vector.insert %492, %490 [3, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%494 = vector.transpose %493, [1, 0, 3, 2, 5, 4] : vector<4x2x1x1x4x1xf32> to vector<2x4x1x1x1x4xf32>
%495 = arith.mulf %494, %arg6 : vector<2x4x1x1x1x4xf32>
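// annotation: %495 rescales the running accumulator by exp2(old_max - new_max) before
// the value update; the second staged tile (%alloc) is re-read below in the same MFMA
// layout, and a second chain of 16x16x16 mfma ops accumulates the contribution of the
// truncated probabilities (%469) for this step.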
%496 = affine.apply affine_map<()[s0] -> (s0 mod 16)>()[%0]
%497 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0]
%498 = vector.transfer_read %alloc[%496, %497], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%499 = vector.insert_strided_slice %498, %cst_2 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%500 = affine.apply affine_map<()[s0] -> (s0 mod 16)>()[%0]
%501 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0]
%502 = vector.transfer_read %alloc[%500, %501], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%503 = vector.insert_strided_slice %502, %499 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%504 = affine.apply affine_map<()[s0] -> (s0 mod 16)>()[%0]
%505 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0]
%506 = vector.transfer_read %alloc[%504, %505], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%507 = vector.insert_strided_slice %506, %503 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%508 = affine.apply affine_map<()[s0] -> (s0 mod 16)>()[%0]
%509 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0]
%510 = vector.transfer_read %alloc[%508, %509], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%511 = vector.insert_strided_slice %510, %507 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%512 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 16)>()[%0]
%513 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0]
%514 = vector.transfer_read %alloc[%512, %513], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%515 = vector.insert_strided_slice %514, %511 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%516 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 16)>()[%0]
%517 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0]
%518 = vector.transfer_read %alloc[%516, %517], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%519 = vector.insert_strided_slice %518, %515 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%520 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 16)>()[%0]
%521 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0]
%522 = vector.transfer_read %alloc[%520, %521], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%523 = vector.insert_strided_slice %522, %519 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%524 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 16)>()[%0]
%525 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0]
%526 = vector.transfer_read %alloc[%524, %525], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%527 = vector.insert_strided_slice %526, %523 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%528 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 32)>()[%0]
%529 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0]
%530 = vector.transfer_read %alloc[%528, %529], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%531 = vector.insert_strided_slice %530, %527 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%532 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 32)>()[%0]
%533 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0]
%534 = vector.transfer_read %alloc[%532, %533], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%535 = vector.insert_strided_slice %534, %531 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%536 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 32)>()[%0]
%537 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0]
%538 = vector.transfer_read %alloc[%536, %537], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%539 = vector.insert_strided_slice %538, %535 {offsets = [2, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%540 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 32)>()[%0]
%541 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0]
%542 = vector.transfer_read %alloc[%540, %541], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%543 = vector.insert_strided_slice %542, %539 {offsets = [2, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%544 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 48)>()[%0]
%545 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0]
%546 = vector.transfer_read %alloc[%544, %545], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%547 = vector.insert_strided_slice %546, %543 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%548 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 48)>()[%0]
%549 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0]
%550 = vector.transfer_read %alloc[%548, %549], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%551 = vector.insert_strided_slice %550, %547 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%552 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 48)>()[%0]
%553 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0]
%554 = vector.transfer_read %alloc[%552, %553], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%555 = vector.insert_strided_slice %554, %551 {offsets = [3, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
%556 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 48)>()[%0]
%557 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0]
%558 = vector.transfer_read %alloc[%556, %557], %cst_10 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%559 = vector.insert_strided_slice %558, %555 {offsets = [3, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16>
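// The block below is the unrolled inner matmul: 32 amdgpu.mfma ops (16x16x16, f16 -> f32)
// covering a 2x4 grid of output tiles, each reducing a 64-wide K dimension in four
// 16-element steps. LHS fragments come from %559 (just loaded from workgroup memory),
// RHS fragments from %469, and the f32 accumulator %495 is carried and re-inserted
// into a fresh 2x4x1x1x1x4 accumulator for the next loop iteration.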
%560 = vector.extract %495[0, 0] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32>
%561 = vector.extract %559[0, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%562 = vector.extract %469[0, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%563 = vector.shape_cast %561 : vector<1x1x1x4xf16> to vector<4xf16>
%564 = vector.shape_cast %562 : vector<1x1x1x4xf16> to vector<4xf16>
%565 = vector.shape_cast %560 : vector<1x1x1x4xf32> to vector<4xf32>
%566 = amdgpu.mfma %563 * %564 + %565 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%567 = vector.extract %559[0, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%568 = vector.extract %469[0, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%569 = vector.shape_cast %567 : vector<1x1x1x4xf16> to vector<4xf16>
%570 = vector.shape_cast %568 : vector<1x1x1x4xf16> to vector<4xf16>
%571 = amdgpu.mfma %569 * %570 + %566 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%572 = vector.extract %559[0, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%573 = vector.extract %469[0, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%574 = vector.shape_cast %572 : vector<1x1x1x4xf16> to vector<4xf16>
%575 = vector.shape_cast %573 : vector<1x1x1x4xf16> to vector<4xf16>
%576 = amdgpu.mfma %574 * %575 + %571 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%577 = vector.extract %559[0, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%578 = vector.extract %469[0, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%579 = vector.shape_cast %577 : vector<1x1x1x4xf16> to vector<4xf16>
%580 = vector.shape_cast %578 : vector<1x1x1x4xf16> to vector<4xf16>
%581 = amdgpu.mfma %579 * %580 + %576 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%582 = vector.shape_cast %581 : vector<4xf32> to vector<1x1x1x4xf32>
%583 = vector.insert %582, %cst_9 [0, 0] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32>
%584 = vector.extract %495[0, 1] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32>
%585 = vector.extract %559[1, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%586 = vector.extract %469[0, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%587 = vector.shape_cast %585 : vector<1x1x1x4xf16> to vector<4xf16>
%588 = vector.shape_cast %586 : vector<1x1x1x4xf16> to vector<4xf16>
%589 = vector.shape_cast %584 : vector<1x1x1x4xf32> to vector<4xf32>
%590 = amdgpu.mfma %587 * %588 + %589 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%591 = vector.extract %559[1, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%592 = vector.extract %469[0, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%593 = vector.shape_cast %591 : vector<1x1x1x4xf16> to vector<4xf16>
%594 = vector.shape_cast %592 : vector<1x1x1x4xf16> to vector<4xf16>
%595 = amdgpu.mfma %593 * %594 + %590 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%596 = vector.extract %559[1, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%597 = vector.extract %469[0, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%598 = vector.shape_cast %596 : vector<1x1x1x4xf16> to vector<4xf16>
%599 = vector.shape_cast %597 : vector<1x1x1x4xf16> to vector<4xf16>
%600 = amdgpu.mfma %598 * %599 + %595 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%601 = vector.extract %559[1, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%602 = vector.extract %469[0, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%603 = vector.shape_cast %601 : vector<1x1x1x4xf16> to vector<4xf16>
%604 = vector.shape_cast %602 : vector<1x1x1x4xf16> to vector<4xf16>
%605 = amdgpu.mfma %603 * %604 + %600 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%606 = vector.shape_cast %605 : vector<4xf32> to vector<1x1x1x4xf32>
%607 = vector.insert %606, %583 [0, 1] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32>
%608 = vector.extract %495[0, 2] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32>
%609 = vector.extract %559[2, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%610 = vector.extract %469[0, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%611 = vector.shape_cast %609 : vector<1x1x1x4xf16> to vector<4xf16>
%612 = vector.shape_cast %610 : vector<1x1x1x4xf16> to vector<4xf16>
%613 = vector.shape_cast %608 : vector<1x1x1x4xf32> to vector<4xf32>
%614 = amdgpu.mfma %611 * %612 + %613 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%615 = vector.extract %559[2, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%616 = vector.extract %469[0, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%617 = vector.shape_cast %615 : vector<1x1x1x4xf16> to vector<4xf16>
%618 = vector.shape_cast %616 : vector<1x1x1x4xf16> to vector<4xf16>
%619 = amdgpu.mfma %617 * %618 + %614 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%620 = vector.extract %559[2, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%621 = vector.extract %469[0, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%622 = vector.shape_cast %620 : vector<1x1x1x4xf16> to vector<4xf16>
%623 = vector.shape_cast %621 : vector<1x1x1x4xf16> to vector<4xf16>
%624 = amdgpu.mfma %622 * %623 + %619 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%625 = vector.extract %559[2, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%626 = vector.extract %469[0, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%627 = vector.shape_cast %625 : vector<1x1x1x4xf16> to vector<4xf16>
%628 = vector.shape_cast %626 : vector<1x1x1x4xf16> to vector<4xf16>
%629 = amdgpu.mfma %627 * %628 + %624 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%630 = vector.shape_cast %629 : vector<4xf32> to vector<1x1x1x4xf32>
%631 = vector.insert %630, %607 [0, 2] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32>
%632 = vector.extract %495[0, 3] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32>
%633 = vector.extract %559[3, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%634 = vector.extract %469[0, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%635 = vector.shape_cast %633 : vector<1x1x1x4xf16> to vector<4xf16>
%636 = vector.shape_cast %634 : vector<1x1x1x4xf16> to vector<4xf16>
%637 = vector.shape_cast %632 : vector<1x1x1x4xf32> to vector<4xf32>
%638 = amdgpu.mfma %635 * %636 + %637 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%639 = vector.extract %559[3, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%640 = vector.extract %469[0, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%641 = vector.shape_cast %639 : vector<1x1x1x4xf16> to vector<4xf16>
%642 = vector.shape_cast %640 : vector<1x1x1x4xf16> to vector<4xf16>
%643 = amdgpu.mfma %641 * %642 + %638 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%644 = vector.extract %559[3, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%645 = vector.extract %469[0, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%646 = vector.shape_cast %644 : vector<1x1x1x4xf16> to vector<4xf16>
%647 = vector.shape_cast %645 : vector<1x1x1x4xf16> to vector<4xf16>
%648 = amdgpu.mfma %646 * %647 + %643 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%649 = vector.extract %559[3, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%650 = vector.extract %469[0, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%651 = vector.shape_cast %649 : vector<1x1x1x4xf16> to vector<4xf16>
%652 = vector.shape_cast %650 : vector<1x1x1x4xf16> to vector<4xf16>
%653 = amdgpu.mfma %651 * %652 + %648 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%654 = vector.shape_cast %653 : vector<4xf32> to vector<1x1x1x4xf32>
%655 = vector.insert %654, %631 [0, 3] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32>
%656 = vector.extract %495[1, 0] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32>
%657 = vector.extract %559[0, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%658 = vector.extract %469[1, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%659 = vector.shape_cast %657 : vector<1x1x1x4xf16> to vector<4xf16>
%660 = vector.shape_cast %658 : vector<1x1x1x4xf16> to vector<4xf16>
%661 = vector.shape_cast %656 : vector<1x1x1x4xf32> to vector<4xf32>
%662 = amdgpu.mfma %659 * %660 + %661 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%663 = vector.extract %559[0, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%664 = vector.extract %469[1, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%665 = vector.shape_cast %663 : vector<1x1x1x4xf16> to vector<4xf16>
%666 = vector.shape_cast %664 : vector<1x1x1x4xf16> to vector<4xf16>
%667 = amdgpu.mfma %665 * %666 + %662 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%668 = vector.extract %559[0, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%669 = vector.extract %469[1, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%670 = vector.shape_cast %668 : vector<1x1x1x4xf16> to vector<4xf16>
%671 = vector.shape_cast %669 : vector<1x1x1x4xf16> to vector<4xf16>
%672 = amdgpu.mfma %670 * %671 + %667 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%673 = vector.extract %559[0, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%674 = vector.extract %469[1, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%675 = vector.shape_cast %673 : vector<1x1x1x4xf16> to vector<4xf16>
%676 = vector.shape_cast %674 : vector<1x1x1x4xf16> to vector<4xf16>
%677 = amdgpu.mfma %675 * %676 + %672 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%678 = vector.shape_cast %677 : vector<4xf32> to vector<1x1x1x4xf32>
%679 = vector.insert %678, %655 [1, 0] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32>
%680 = vector.extract %495[1, 1] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32>
%681 = vector.extract %559[1, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%682 = vector.extract %469[1, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%683 = vector.shape_cast %681 : vector<1x1x1x4xf16> to vector<4xf16>
%684 = vector.shape_cast %682 : vector<1x1x1x4xf16> to vector<4xf16>
%685 = vector.shape_cast %680 : vector<1x1x1x4xf32> to vector<4xf32>
%686 = amdgpu.mfma %683 * %684 + %685 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%687 = vector.extract %559[1, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%688 = vector.extract %469[1, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%689 = vector.shape_cast %687 : vector<1x1x1x4xf16> to vector<4xf16>
%690 = vector.shape_cast %688 : vector<1x1x1x4xf16> to vector<4xf16>
%691 = amdgpu.mfma %689 * %690 + %686 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%692 = vector.extract %559[1, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%693 = vector.extract %469[1, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%694 = vector.shape_cast %692 : vector<1x1x1x4xf16> to vector<4xf16>
%695 = vector.shape_cast %693 : vector<1x1x1x4xf16> to vector<4xf16>
%696 = amdgpu.mfma %694 * %695 + %691 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%697 = vector.extract %559[1, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%698 = vector.extract %469[1, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%699 = vector.shape_cast %697 : vector<1x1x1x4xf16> to vector<4xf16>
%700 = vector.shape_cast %698 : vector<1x1x1x4xf16> to vector<4xf16>
%701 = amdgpu.mfma %699 * %700 + %696 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%702 = vector.shape_cast %701 : vector<4xf32> to vector<1x1x1x4xf32>
%703 = vector.insert %702, %679 [1, 1] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32>
%704 = vector.extract %495[1, 2] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32>
%705 = vector.extract %559[2, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%706 = vector.extract %469[1, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%707 = vector.shape_cast %705 : vector<1x1x1x4xf16> to vector<4xf16>
%708 = vector.shape_cast %706 : vector<1x1x1x4xf16> to vector<4xf16>
%709 = vector.shape_cast %704 : vector<1x1x1x4xf32> to vector<4xf32>
%710 = amdgpu.mfma %707 * %708 + %709 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%711 = vector.extract %559[2, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%712 = vector.extract %469[1, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%713 = vector.shape_cast %711 : vector<1x1x1x4xf16> to vector<4xf16>
%714 = vector.shape_cast %712 : vector<1x1x1x4xf16> to vector<4xf16>
%715 = amdgpu.mfma %713 * %714 + %710 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%716 = vector.extract %559[2, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%717 = vector.extract %469[1, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%718 = vector.shape_cast %716 : vector<1x1x1x4xf16> to vector<4xf16>
%719 = vector.shape_cast %717 : vector<1x1x1x4xf16> to vector<4xf16>
%720 = amdgpu.mfma %718 * %719 + %715 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%721 = vector.extract %559[2, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%722 = vector.extract %469[1, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%723 = vector.shape_cast %721 : vector<1x1x1x4xf16> to vector<4xf16>
%724 = vector.shape_cast %722 : vector<1x1x1x4xf16> to vector<4xf16>
%725 = amdgpu.mfma %723 * %724 + %720 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%726 = vector.shape_cast %725 : vector<4xf32> to vector<1x1x1x4xf32>
%727 = vector.insert %726, %703 [1, 2] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32>
%728 = vector.extract %495[1, 3] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32>
%729 = vector.extract %559[3, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%730 = vector.extract %469[1, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%731 = vector.shape_cast %729 : vector<1x1x1x4xf16> to vector<4xf16>
%732 = vector.shape_cast %730 : vector<1x1x1x4xf16> to vector<4xf16>
%733 = vector.shape_cast %728 : vector<1x1x1x4xf32> to vector<4xf32>
%734 = amdgpu.mfma %731 * %732 + %733 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%735 = vector.extract %559[3, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%736 = vector.extract %469[1, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%737 = vector.shape_cast %735 : vector<1x1x1x4xf16> to vector<4xf16>
%738 = vector.shape_cast %736 : vector<1x1x1x4xf16> to vector<4xf16>
%739 = amdgpu.mfma %737 * %738 + %734 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%740 = vector.extract %559[3, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%741 = vector.extract %469[1, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%742 = vector.shape_cast %740 : vector<1x1x1x4xf16> to vector<4xf16>
%743 = vector.shape_cast %741 : vector<1x1x1x4xf16> to vector<4xf16>
%744 = amdgpu.mfma %742 * %743 + %739 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%745 = vector.extract %559[3, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16>
%746 = vector.extract %469[1, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16>
%747 = vector.shape_cast %745 : vector<1x1x1x4xf16> to vector<4xf16>
%748 = vector.shape_cast %746 : vector<1x1x1x4xf16> to vector<4xf16>
%749 = amdgpu.mfma %747 * %748 + %744 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%750 = vector.shape_cast %749 : vector<4xf32> to vector<1x1x1x4xf32>
%751 = vector.insert %750, %727 [1, 3] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32>
scf.yield %429, %468, %751 : vector<2x1x1xf32>, vector<2x1x1xf32>, vector<2x4x1x1x1x4xf32>
}
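// Epilogue of the reduction loop: the final per-row values (%72#1) are broadcast across the
// accumulator layout, a constant vector is divided by them (a reciprocal-style rescale,
// presumably the softmax row-sum normalization of this attention dispatch), and the f32
// accumulator is scaled and truncated to f16 before being staged into workgroup memory below.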
%73 = vector.extract %72#1[0, 0] : vector<1xf32> from vector<2x1x1xf32>
%74 = vector.broadcast %73 : vector<1xf32> to vector<4x1xf32>
%75 = vector.insert %74, %cst [0, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%76 = vector.extract %72#1[1, 0] : vector<1xf32> from vector<2x1x1xf32>
%77 = vector.broadcast %76 : vector<1xf32> to vector<4x1xf32>
%78 = vector.insert %77, %75 [0, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%79 = vector.extract %72#1[0, 0] : vector<1xf32> from vector<2x1x1xf32>
%80 = vector.broadcast %79 : vector<1xf32> to vector<4x1xf32>
%81 = vector.insert %80, %78 [1, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%82 = vector.extract %72#1[1, 0] : vector<1xf32> from vector<2x1x1xf32>
%83 = vector.broadcast %82 : vector<1xf32> to vector<4x1xf32>
%84 = vector.insert %83, %81 [1, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%85 = vector.extract %72#1[0, 0] : vector<1xf32> from vector<2x1x1xf32>
%86 = vector.broadcast %85 : vector<1xf32> to vector<4x1xf32>
%87 = vector.insert %86, %84 [2, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%88 = vector.extract %72#1[1, 0] : vector<1xf32> from vector<2x1x1xf32>
%89 = vector.broadcast %88 : vector<1xf32> to vector<4x1xf32>
%90 = vector.insert %89, %87 [2, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%91 = vector.extract %72#1[0, 0] : vector<1xf32> from vector<2x1x1xf32>
%92 = vector.broadcast %91 : vector<1xf32> to vector<4x1xf32>
%93 = vector.insert %92, %90 [3, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%94 = vector.extract %72#1[1, 0] : vector<1xf32> from vector<2x1x1xf32>
%95 = vector.broadcast %94 : vector<1xf32> to vector<4x1xf32>
%96 = vector.insert %95, %93 [3, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32>
%97 = arith.divf %cst_5, %96 : vector<4x2x1x1x4x1xf32>
%98 = vector.transpose %97, [1, 0, 3, 2, 5, 4] : vector<4x2x1x1x4x1xf32> to vector<2x4x1x1x1x4xf32>
%99 = arith.mulf %98, %72#2 : vector<2x4x1x1x1x4xf32>
%100 = arith.truncf %99 : vector<2x4x1x1x1x4xf32> to vector<2x4x1x1x1x4xf16>
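// Each thread writes its eight 1x4 f16 result fragments into the shared 64x64 output tile;
// the row/column offsets below are that thread's MFMA result-layout coordinates derived from %0.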
%101 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%102 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0]
%103 = vector.extract %100[0, 0, 0, 0] : vector<1x4xf16> from vector<2x4x1x1x1x4xf16>
vector.transfer_write %103, %alloc_13[%101, %102] {in_bounds = [true, true]} : vector<1x4xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%104 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%105 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0]
%106 = vector.extract %100[0, 1, 0, 0] : vector<1x4xf16> from vector<2x4x1x1x1x4xf16>
vector.transfer_write %106, %alloc_13[%104, %105] {in_bounds = [true, true]} : vector<1x4xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%107 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%108 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0]
%109 = vector.extract %100[0, 2, 0, 0] : vector<1x4xf16> from vector<2x4x1x1x1x4xf16>
vector.transfer_write %109, %alloc_13[%107, %108] {in_bounds = [true, true]} : vector<1x4xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%110 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%111 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0]
%112 = vector.extract %100[0, 3, 0, 0] : vector<1x4xf16> from vector<2x4x1x1x1x4xf16>
vector.transfer_write %112, %alloc_13[%110, %111] {in_bounds = [true, true]} : vector<1x4xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%113 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%114 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0]
%115 = vector.extract %100[1, 0, 0, 0] : vector<1x4xf16> from vector<2x4x1x1x1x4xf16>
vector.transfer_write %115, %alloc_13[%113, %114] {in_bounds = [true, true]} : vector<1x4xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%116 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%117 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0]
%118 = vector.extract %100[1, 1, 0, 0] : vector<1x4xf16> from vector<2x4x1x1x1x4xf16>
vector.transfer_write %118, %alloc_13[%116, %117] {in_bounds = [true, true]} : vector<1x4xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%119 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%120 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0]
%121 = vector.extract %100[1, 2, 0, 0] : vector<1x4xf16> from vector<2x4x1x1x1x4xf16>
vector.transfer_write %121, %alloc_13[%119, %120] {in_bounds = [true, true]} : vector<1x4xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%122 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%123 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0]
%124 = vector.extract %100[1, 3, 0, 0] : vector<1x4xf16> from vector<2x4x1x1x1x4xf16>
vector.transfer_write %124, %alloc_13[%122, %123] {in_bounds = [true, true]} : vector<1x4xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
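// Copy the completed 64x64 f16 output tile from workgroup memory back to the global result
// buffer; the barriers around memref.copy keep the whole workgroup synchronized on the tile.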
%subview_15 = memref.subview %10[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_16 = memref.subview %subview_15[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %alloc_13, %subview_16 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<64x64xf16, #gpu.address_space<workgroup>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.dealloc %alloc_13 : memref<64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc_12 : memref<64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc_11 : memref<64x64xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<64x64xf16, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} {
%cst = arith.constant dense<0.000000e+00> : vector<4xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<4x2x1x1x4x1xf32>
%cst_1 = arith.constant dense<0.000000e+00> : vector<2xf32>
%cst_2 = arith.constant dense<0xFF800000> : vector<2x1x1xf32>
%cst_3 = arith.constant dense<0.000000e+00> : vector<4x4x1x1x1x4xf16>
%cst_4 = arith.constant dense<0.000000e+00> : vector<2x4x1x1x1x4xf16>
%cst_5 = arith.constant dense<0.000000e+00> : vector<4x1x1x1x1x8xf16>
%cst_6 = arith.constant dense<1.000000e+00> : vector<4x2x1x1x4x1xf32>
%cst_7 = arith.constant dense<1.802980e-01> : vector<4x1x1x1x1x8xf16>
%cst_8 = arith.constant dense<0.000000e+00> : vector<2x1x1xf32>
%cst_9 = arith.constant dense<-3.40282347E+38> : vector<2x1x1xf32>
%cst_10 = arith.constant dense<0.000000e+00> : vector<2x4x1x1x1x4xf32>
%cst_11 = arith.constant 0.000000e+00 : f16
%c64 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc_12 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc_13 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc_14 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
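// Four 64x64xf16 workgroup (LDS) buffers stage operand and result tiles for this dispatch.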
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6:3 = util.assume.int
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
: index, index, index
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview = memref.subview %7[1, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_15 = memref.subview %7[0, 0, 0, 0, 0] [1, 2, 10, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
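// Workgroup tiling: each (z, y, x) iteration of the forall covers one batch index (0..1),
// one head index (0..9), and a 64-row slice of the 4096-long sequence dimension.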
scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) {
gpu.barrier
%10 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 16) * 16)>()[%arg2, %thread_id_x]
%11 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x]
%12 = vector.transfer_read %subview_15[%arg0, %arg1, %10, %11], %cst_11 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%13 = vector.insert_strided_slice %12, %cst_5 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<4x1x1x1x1x8xf16>
%14 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 16) * 16 + 16)>()[%arg2, %thread_id_x]
%15 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x]
%16 = vector.transfer_read %subview_15[%arg0, %arg1, %14, %15], %cst_11 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%17 = vector.insert_strided_slice %16, %13 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<4x1x1x1x1x8xf16>
%18 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 16) * 16 + 32)>()[%arg2, %thread_id_x]
%19 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x]
%20 = vector.transfer_read %subview_15[%arg0, %arg1, %18, %19], %cst_11 {in_bounds = [true, true]} : memref<2x10x4096x64xf16, strided<[2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%21 = vector.insert_strided_slice %20, %17 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<4x1x1x1x1x8xf16>
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 16) * 16 + 48