// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}> | |
#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map6 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {hal.device.targets = [#device_target_hip]} { | |
hal.executable public @run_forward$async_dispatch_46 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}> | |
#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map6 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @run_forward$async_dispatch_46 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}> | |
#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map6 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @run_forward$async_dispatch_46 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}> | |
#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map6 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @run_forward$async_dispatch_46 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}> | |
#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> ()> | |
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map6 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @run_forward$async_dispatch_46 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After GPUGeneralizeNamedOpsPass (iree-codegen-gpu-generalize-named-ops) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After TypePropagationPass (iree-codegen-type-propagation) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After MaterializeEncodingIntoNopPass (iree-codegen-materialize-encoding-into-nop) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After BlockDynamicDimensionsPass (iree-codegen-block-dynamic-dimensions) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After MaterializeTuningSpecsPass (iree-codegen-materialize-tuning-specs) //----- // | |
module { | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
} | |
// -----// IR Dump After MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- // | |
module { | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
} | |
// -----// IR Dump After LLVMGPUSelectLoweringStrategyPass (iree-llvmgpu-select-lowering-strategy) //----- // | |
module { | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
} | |
// -----// IR Dump After ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- // | |
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>) { | |
hal.executable.export public @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic ordinal(0) layout(#hal.pipeline.layout<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
} | |
} | |
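// NOTE (annotation, not part of the compiler dump): compared with the initial dump, the
// func.func now carries translation_info selecting the LLVMGPUVectorDistribute pipeline
// with workgroup_size = [128, 1, 1] and subgroup_size = 64. The attention op's
// lowering_config (workgroup tile [1, 1, 64, 64], reduction tile 64, and the
// MFMA_F32_16x16x16_F16 intrinsic with subgroup_m_count = 2 for both the QK and PV
// matmuls) is what the tiling and vector-distribution passes below key off of.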
// -----// IR Dump After ConfigureExecutablesPass (iree-hal-configure-executables) //----- // | |
hal.executable public @run_forward$async_dispatch_46 { | |
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>) { | |
hal.executable.export public @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic ordinal(0) layout(#hal.pipeline.layout<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
} | |
} | |
} | |
// -----// IR Dump After HoistExecutableObjectsPass (iree-hal-hoist-executable-objects) //----- // | |
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>) { | |
hal.executable.export public @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic ordinal(0) layout(#hal.pipeline.layout<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
} | |
} | |
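// NOTE (annotation, not part of the compiler dump): this dump is identical to the
// configured variant above; there are no #hal.executable.objects attributes in this
// module for the hoisting pass to move, so the IR is unchanged.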
// -----// IR Dump After LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- // | |
module { | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16> | |
%11 = tensor.empty() : tensor<2x4096x10x64xf16> | |
%12 = tensor.empty() : tensor<2x10x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16> | |
%15 = iree_linalg_ext.attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%14, %13, %10, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%12 : tensor<2x10x4096x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<2x10x4096x64xf16> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x10x4096x64xf16>) outs(%11 : tensor<2x4096x10x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<2x4096x10x64xf16> | |
flow.dispatch.tensor.store %16, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
} | |
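// NOTE (annotation, not part of the compiler dump): the function body is unchanged
// here; no transform-dialect strategy is attached to this dispatch, so lowering
// proceeds with the LLVMGPUVectorDistribute pipeline named in translation_info.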
// -----// IR Dump After TileAndDistributeToWorkgroupsPass (iree-codegen-tile-and-distribute-to-workgroups) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c64 = arith.constant 64 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%11 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, %c64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x?x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, %c0, 0], sizes = [1, 1, %c64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x?x4096xf16> | |
%14 = tensor.empty() : tensor<1x1x64x64xf16> | |
%15 = iree_linalg_ext.attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%11, %12, %13, %cst : tensor<1x1x?x64xf16>, tensor<1x1x4096x64xf16>, tensor<1x1x?x4096xf16>, f16) outs(%14 : tensor<1x1x64x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<1x1x64x64xf16> | |
%16 = tensor.empty() : tensor<1x64x1x64xf16> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<1x1x64x64xf16>) outs(%16 : tensor<1x64x1x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x64x1x64xf16> | |
%cast = tensor.cast %17 : tensor<1x64x1x64xf16> to tensor<1x?x1x?xf16> | |
%18 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
flow.dispatch.tensor.store %cast, %9, offsets = [%workgroup_id_z, %18, %workgroup_id_y, %c0], sizes = [1, %c64, 1, %c64], strides = [1, 1, 1, 1] : tensor<1x?x1x?xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
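// NOTE (annotation, not part of the compiler dump): with the workgroup tile
// [1, 1, 64, 64], each workgroup handles one (batch, head) pair and one 64-row query
// tile, so the dispatch launches a 4096/64 x 10 x 2 = 64 x 10 x 2 grid of workgroups,
// each with 128 threads (two subgroups of 64, matching subgroup_m_count = 2). The
// tile's row offset is the affine expression used twice above:
//   %row = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]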
// -----// IR Dump After CSE (cse) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c64 = arith.constant 64 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%11 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, %c64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x?x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%13 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, %c0, 0], sizes = [1, 1, %c64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x?x4096xf16> | |
%14 = tensor.empty() : tensor<1x1x64x64xf16> | |
%15 = iree_linalg_ext.attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%11, %12, %13, %cst : tensor<1x1x?x64xf16>, tensor<1x1x4096x64xf16>, tensor<1x1x?x4096xf16>, f16) outs(%14 : tensor<1x1x64x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<1x1x64x64xf16> | |
%16 = tensor.empty() : tensor<1x64x1x64xf16> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<1x1x64x64xf16>) outs(%16 : tensor<1x64x1x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x64x1x64xf16> | |
%cast = tensor.cast %17 : tensor<1x64x1x64xf16> to tensor<1x?x1x?xf16> | |
flow.dispatch.tensor.store %cast, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, %c0], sizes = [1, %c64, 1, %c64], strides = [1, 1, 1, 1] : tensor<1x?x1x?xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
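// NOTE (annotation, not part of the compiler dump): CSE folded the second, identical
// affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] into the first, so the
// flow.dispatch.tensor.store now reuses %10 as its row offset instead of recomputing it.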
// -----// IR Dump After ConvertToDestinationPassingStylePass (iree-codegen-convert-to-destination-passing-style) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c64 = arith.constant 64 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, %c0], sizes = [1, %c64, 1, %c64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x?x1x?xf16> | |
%cast = tensor.cast %11 : tensor<1x?x1x?xf16> to tensor<1x64x1x64xf16> | |
%workgroup_id_x_0 = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y_1 = hal.interface.workgroup.id[1] : index | |
%workgroup_id_z_2 = hal.interface.workgroup.id[2] : index | |
%12 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x_0] | |
%13 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z_2, %workgroup_id_y_1, %12, 0], sizes = [1, 1, 1, %c64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x?x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z_2, %workgroup_id_y_1, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%15 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z_2, %workgroup_id_y_1, %c0, 0], sizes = [1, 1, %c64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x?x4096xf16> | |
%16 = tensor.empty() : tensor<1x1x64x64xf16> | |
%17 = iree_linalg_ext.attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%13, %14, %15, %cst : tensor<1x1x?x64xf16>, tensor<1x1x4096x64xf16>, tensor<1x1x?x4096xf16>, f16) outs(%16 : tensor<1x1x64x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<1x1x64x64xf16> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<1x1x64x64xf16>) outs(%cast : tensor<1x64x1x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x64x1x64xf16> | |
%cast_3 = tensor.cast %18 : tensor<1x64x1x64xf16> to tensor<1x?x1x?xf16> | |
flow.dispatch.tensor.store %cast_3, %9, offsets = [%workgroup_id_z_2, %12, %workgroup_id_y_1, %c0], sizes = [1, %c64, 1, %c64], strides = [1, 1, 1, 1] : tensor<1x?x1x?xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
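// NOTE (annotation, not part of the compiler dump): destination-passing style replaces
// the transpose's tensor.empty init with a tile loaded from the writeonly output
// binding (%11 from %9, cast to tensor<1x64x1x64xf16>), so that after bufferization the
// final linalg.generic writes its result in place into the output buffer rather than
// into a separately allocated temporary.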
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%workgroup_id_x_0 = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y_1 = hal.interface.workgroup.id[1] : index | |
%workgroup_id_z_2 = hal.interface.workgroup.id[2] : index | |
%12 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x_0] | |
%13 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z_2, %workgroup_id_y_1, %12, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%14 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z_2, %workgroup_id_y_1, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%15 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z_2, %workgroup_id_y_1, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%16 = tensor.empty() : tensor<1x1x64x64xf16> | |
%17 = iree_linalg_ext.attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%13, %14, %15, %cst : tensor<1x1x64x64xf16>, tensor<1x1x4096x64xf16>, tensor<1x1x64x4096xf16>, f16) outs(%16 : tensor<1x1x64x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<1x1x64x64xf16> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<1x1x64x64xf16>) outs(%11 : tensor<1x64x1x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %18, %9, offsets = [%workgroup_id_z_2, %12, %workgroup_id_y_1, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
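// Note: CSE folds the duplicated hal.interface.workgroup.id queries and the repeated affine.apply (s0 * 64) into single SSA values, so every load/store below indexes off the same %workgroup_id_* values and %10.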
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<1x1x64x64xf16> | |
%16 = iree_linalg_ext.attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%12, %13, %14, %cst : tensor<1x1x64x64xf16>, tensor<1x1x4096x64xf16>, tensor<1x1x64x4096xf16>, f16) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<1x1x64x64xf16> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%16 : tensor<1x1x64x64xf16>) outs(%11 : tensor<1x64x1x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %17, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After ReorderWorkgroupsPass (iree-codegen-reorder-workgroups) //----- // | |
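// Note: no workgroup reordering is applied to this dispatch; the function below is identical to the previous dump.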
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<1x1x64x64xf16> | |
%16 = iree_linalg_ext.attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%12, %13, %14, %cst : tensor<1x1x64x64xf16>, tensor<1x1x4096x64xf16>, tensor<1x1x64x4096xf16>, f16) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<1x1x64x64xf16> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%16 : tensor<1x1x64x64xf16>) outs(%11 : tensor<1x64x1x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %17, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After ConvertAttentionToOnlineAttentionPass (iree-linalg-ext-convert-attention-to-online-attention) //----- // | |
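// Note: the single iree_linalg_ext.attention op is rewritten as iree_linalg_ext.online_attention with three results (an f32 accumulator filled with 0.0, a running row max filled with -3.40282347E+38, and a running row sum filled with 0.0), followed by a linalg.generic that rescales the accumulator by 1/sum and truncates the result back to f16.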
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<1x1x64x64xf16> | |
%16 = tensor.empty() : tensor<1x1x64x64xf32> | |
%17 = tensor.empty() : tensor<1x1x64xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant -3.40282347E+38 : f32 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%18 = linalg.fill ins(%cst_0 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32> | |
%19 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%20 = linalg.fill ins(%cst_2 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%21:3 = iree_linalg_ext.online_attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%12, %13, %14, %cst : tensor<1x1x64x64xf16>, tensor<1x1x4096x64xf16>, tensor<1x1x64x4096xf16>, f16) outs(%18, %19, %20 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21#2, %21#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f32, %in_3: f32, %out: f16): | |
%cst_4 = arith.constant 1.000000e+00 : f32 | |
%24 = arith.divf %cst_4, %in : f32 | |
%25 = arith.mulf %24, %in_3 : f32 | |
%26 = arith.truncf %25 : f32 to f16 | |
linalg.yield %26 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22 : tensor<1x1x64x64xf16>) outs(%11 : tensor<1x64x1x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %23, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // | |
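// Note: canonicalization hoists the scalar constants to the function entry (including the 1.0 used for the 1/sum reciprocal, previously materialized inside the rescaling generic) and deduplicates the two identical 0.0 fill constants.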
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.000000e+00 : f32 | |
%cst_0 = arith.constant -3.40282347E+38 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%cst_2 = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<1x1x64x64xf16> | |
%16 = tensor.empty() : tensor<1x1x64x64xf32> | |
%17 = tensor.empty() : tensor<1x1x64xf32> | |
%18 = linalg.fill ins(%cst_1 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32> | |
%19 = linalg.fill ins(%cst_0 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%20 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%21:3 = iree_linalg_ext.online_attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%12, %13, %14, %cst_2 : tensor<1x1x64x64xf16>, tensor<1x1x4096x64xf16>, tensor<1x1x64x4096xf16>, f16) outs(%18, %19, %20 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21#2, %21#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f32, %in_3: f32, %out: f16): | |
%24 = arith.divf %cst, %in : f32 | |
%25 = arith.mulf %24, %in_3 : f32 | |
%26 = arith.truncf %25 : f32 to f16 | |
linalg.yield %26 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22 : tensor<1x1x64x64xf16>) outs(%11 : tensor<1x64x1x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %23, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.000000e+00 : f32 | |
%cst_0 = arith.constant -3.40282347E+38 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%cst_2 = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<1x1x64x64xf16> | |
%16 = tensor.empty() : tensor<1x1x64x64xf32> | |
%17 = tensor.empty() : tensor<1x1x64xf32> | |
%18 = linalg.fill ins(%cst_1 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32> | |
%19 = linalg.fill ins(%cst_0 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%20 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%21:3 = iree_linalg_ext.online_attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%12, %13, %14, %cst_2 : tensor<1x1x64x64xf16>, tensor<1x1x4096x64xf16>, tensor<1x1x64x4096xf16>, f16) outs(%18, %19, %20 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21#2, %21#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f32, %in_3: f32, %out: f16): | |
%24 = arith.divf %cst, %in : f32 | |
%25 = arith.mulf %24, %in_3 : f32 | |
%26 = arith.truncf %25 : f32 to f16 | |
linalg.yield %26 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22 : tensor<1x1x64x64xf16>) outs(%11 : tensor<1x64x1x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %23, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After GPUPromoteMatmulOperandsPass (iree-codegen-gpu-promote-matmul-operands) //----- // | |
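// Note: per promote_operands = [0, 1, 2], the Q, K, and V tiles are first copied into fresh tensors via linalg.copy ops tagged with #iree_gpu.derived_thread_config; the online_attention op now consumes the copies (%22, %24, %26) instead of the raw dispatch loads.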
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.000000e+00 : f32 | |
%cst_0 = arith.constant -3.40282347E+38 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%cst_2 = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<1x1x64x64xf16> | |
%16 = tensor.empty() : tensor<1x1x64x64xf32> | |
%17 = tensor.empty() : tensor<1x1x64xf32> | |
%18 = linalg.fill ins(%cst_1 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32> | |
%19 = linalg.fill ins(%cst_0 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%20 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%21 = tensor.empty() : tensor<1x1x64x64xf16> | |
%22 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%12 : tensor<1x1x64x64xf16>) outs(%21 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%23 = tensor.empty() : tensor<1x1x4096x64xf16> | |
%24 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%13 : tensor<1x1x4096x64xf16>) outs(%23 : tensor<1x1x4096x64xf16>) -> tensor<1x1x4096x64xf16> | |
%25 = tensor.empty() : tensor<1x1x64x4096xf16> | |
%26 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%14 : tensor<1x1x64x4096xf16>) outs(%25 : tensor<1x1x64x4096xf16>) -> tensor<1x1x64x4096xf16> | |
%27:3 = iree_linalg_ext.online_attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%22, %24, %26, %cst_2 : tensor<1x1x64x64xf16>, tensor<1x1x4096x64xf16>, tensor<1x1x64x4096xf16>, f16) outs(%18, %19, %20 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) { | |
^bb0(%arg0: f32): | |
iree_linalg_ext.yield %arg0 : f32 | |
} -> tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32> | |
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%27#2, %27#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f32, %in_3: f32, %out: f16): | |
%30 = arith.divf %cst, %in : f32 | |
%31 = arith.mulf %30, %in_3 : f32 | |
%32 = arith.truncf %31 : f32 to f16 | |
linalg.yield %32 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%28 : tensor<1x1x64x64xf16>) outs(%11 : tensor<1x64x1x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %29, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After GPUApplyTilingLevelPass (iree-codegen-gpu-apply-tiling-level) //----- // | |
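// Note: the reduction dimension (sequence length 4096) is tiled by 64 per the lowering_config, producing an scf.for loop from 0 to 4096 step 64 that carries the accumulator, row max, and row sum as iter_args; each iteration extracts 64-wide K and V slices, copies them, and runs online_attention on 64x64 tiles.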
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 1.000000e+00 : f32 | |
%cst_0 = arith.constant -3.40282347E+38 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%cst_2 = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<1x1x64x64xf16> | |
%16 = tensor.empty() : tensor<1x1x64x64xf32> | |
%17 = tensor.empty() : tensor<1x1x64xf32> | |
%18 = linalg.fill ins(%cst_1 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32> | |
%19 = linalg.fill ins(%cst_0 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%20 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%21 = tensor.empty() : tensor<1x1x64x64xf16> | |
%22 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%12 : tensor<1x1x64x64xf16>) outs(%21 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%23:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %18, %arg2 = %19, %arg3 = %20) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) { | |
%extracted_slice = tensor.extract_slice %13[0, 0, %arg0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x4096x64xf16> to tensor<1x1x64x64xf16> | |
%26 = tensor.empty() : tensor<1x1x64x64xf16> | |
%27 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x64x64xf16>) outs(%26 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%extracted_slice_3 = tensor.extract_slice %14[0, 0, 0, %arg0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x4096xf16> to tensor<1x1x64x64xf16> | |
%28 = tensor.empty() : tensor<1x1x64x64xf16> | |
%29 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x64x64xf16>) outs(%28 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%30:3 = iree_linalg_ext.online_attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%22, %27, %29, %cst_2 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>, f16) outs(%arg1, %arg2, %arg3 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32> | |
scf.yield %30#0, %30#1, %30#2 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32> | |
} | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23#2, %23#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f32, %in_3: f32, %out: f16): | |
%26 = arith.divf %cst, %in : f32 | |
%27 = arith.mulf %26, %in_3 : f32 | |
%28 = arith.truncf %27 : f32 to f16 | |
linalg.yield %28 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%24 : tensor<1x1x64x64xf16>) outs(%11 : tensor<1x64x1x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %25, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
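Note on the dump above (annotation, not compiler output): each workgroup handles one 64x64 query tile (%22, with scale 1.250000e-01 = 1/sqrt(64)), walks the 4096-long K/V sequence in 64-wide tiles inside the scf.for, carries (accumulator %arg1, running row max %arg2, running row sum %arg3) as iter_args, and only after the loop normalizes by 1/row_sum (%24) and transposes into the 2x4096x10x64 layout (%25). A rough NumPy sketch of that control flow, for orientation only; it uses plain exp instead of the exp2/log2(e) form that the decomposed dumps below switch to, and simplifies the tensor layouts:

```python
import numpy as np

# Illustrative reference for the scf.for above: q is one 64x64 query tile,
# k is the 4096x64 operand (%13) and v the 64x4096 operand (%14), sliced
# 64 rows/columns at a time, mirroring the per-iteration copies %27 and %29.
def workgroup_attention(q, k, v, scale, tile=64):
    m, d = q.shape
    acc = np.zeros((m, d), np.float32)                  # %18: linalg.fill 0.0
    row_max = np.full(m, -3.4028235e38, np.float32)     # %19: linalg.fill -3.40282347E+38
    row_sum = np.zeros(m, np.float32)                   # %20: linalg.fill 0.0
    for k0 in range(0, k.shape[0], tile):               # scf.for %arg0 = 0 to 4096 step 64
        kt = k[k0:k0 + tile].astype(np.float32)         # K tile, like %27
        vt = v[:, k0:k0 + tile].astype(np.float32)      # V tile, like %29 (head_dim x kv)
        s = (q.astype(np.float32) * scale) @ kt.T       # QK^T for this tile
        new_max = np.maximum(row_max, s.max(axis=1))
        corr = np.exp(row_max - new_max)                # rescale factor for the old state
        p = np.exp(s - new_max[:, None])
        row_sum = row_sum * corr + p.sum(axis=1)
        acc = acc * corr[:, None] + p @ vt.T
        row_max = new_max
    return (acc / row_sum[:, None]).astype(np.float16)  # the trailing normalization (%24)
```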
// -----// IR Dump After LoopCoalescing (affine-loop-coalescing) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 1.000000e+00 : f32 | |
%cst_0 = arith.constant -3.40282347E+38 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%cst_2 = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<1x1x64x64xf16> | |
%16 = tensor.empty() : tensor<1x1x64x64xf32> | |
%17 = tensor.empty() : tensor<1x1x64xf32> | |
%18 = linalg.fill ins(%cst_1 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32> | |
%19 = linalg.fill ins(%cst_0 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%20 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%21 = tensor.empty() : tensor<1x1x64x64xf16> | |
%22 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%12 : tensor<1x1x64x64xf16>) outs(%21 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%23:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %18, %arg2 = %19, %arg3 = %20) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) { | |
%extracted_slice = tensor.extract_slice %13[0, 0, %arg0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x4096x64xf16> to tensor<1x1x64x64xf16> | |
%26 = tensor.empty() : tensor<1x1x64x64xf16> | |
%27 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x64x64xf16>) outs(%26 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%extracted_slice_3 = tensor.extract_slice %14[0, 0, 0, %arg0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x4096xf16> to tensor<1x1x64x64xf16> | |
%28 = tensor.empty() : tensor<1x1x64x64xf16> | |
%29 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x64x64xf16>) outs(%28 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%30:3 = iree_linalg_ext.online_attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%22, %27, %29, %cst_2 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>, f16) outs(%arg1, %arg2, %arg3 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32> | |
scf.yield %30#0, %30#1, %30#2 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32> | |
} | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23#2, %23#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f32, %in_3: f32, %out: f16): | |
%26 = arith.divf %cst, %in : f32 | |
%27 = arith.mulf %26, %in_3 : f32 | |
%28 = arith.truncf %27 : f32 to f16 | |
linalg.yield %28 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%24 : tensor<1x1x64x64xf16>) outs(%11 : tensor<1x64x1x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %25, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 1.000000e+00 : f32 | |
%cst_0 = arith.constant -3.40282347E+38 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%cst_2 = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<1x1x64x64xf16> | |
%16 = tensor.empty() : tensor<1x1x64x64xf32> | |
%17 = tensor.empty() : tensor<1x1x64xf32> | |
%18 = linalg.fill ins(%cst_1 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32> | |
%19 = linalg.fill ins(%cst_0 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%20 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%21 = tensor.empty() : tensor<1x1x64x64xf16> | |
%22 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%12 : tensor<1x1x64x64xf16>) outs(%21 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%23:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %18, %arg2 = %19, %arg3 = %20) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) { | |
%extracted_slice = tensor.extract_slice %13[0, 0, %arg0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x4096x64xf16> to tensor<1x1x64x64xf16> | |
%26 = tensor.empty() : tensor<1x1x64x64xf16> | |
%27 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x64x64xf16>) outs(%26 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%extracted_slice_3 = tensor.extract_slice %14[0, 0, 0, %arg0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x4096xf16> to tensor<1x1x64x64xf16> | |
%28 = tensor.empty() : tensor<1x1x64x64xf16> | |
%29 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x64x64xf16>) outs(%28 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%30:3 = iree_linalg_ext.online_attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%22, %27, %29, %cst_2 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>, f16) outs(%arg1, %arg2, %arg3 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32> | |
scf.yield %30#0, %30#1, %30#2 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32> | |
} | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23#2, %23#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f32, %in_3: f32, %out: f16): | |
%26 = arith.divf %cst, %in : f32 | |
%27 = arith.mulf %26, %in_3 : f32 | |
%28 = arith.truncf %27 : f32 to f16 | |
linalg.yield %28 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%24 : tensor<1x1x64x64xf16>) outs(%11 : tensor<1x64x1x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %25, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 1.000000e+00 : f32 | |
%cst_0 = arith.constant -3.40282347E+38 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%cst_2 = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<1x1x64x64xf16> | |
%16 = tensor.empty() : tensor<1x1x64x64xf32> | |
%17 = tensor.empty() : tensor<1x1x64xf32> | |
%18 = linalg.fill ins(%cst_1 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32> | |
%19 = linalg.fill ins(%cst_0 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%20 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%12 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%22:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %18, %arg2 = %19, %arg3 = %20) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) { | |
%extracted_slice = tensor.extract_slice %13[0, 0, %arg0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x4096x64xf16> to tensor<1x1x64x64xf16> | |
%25 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%extracted_slice_3 = tensor.extract_slice %14[0, 0, 0, %arg0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x4096xf16> to tensor<1x1x64x64xf16> | |
%26 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%27:3 = iree_linalg_ext.online_attention {decomposition_config = {pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}, qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>}}, indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>], lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}>} ins(%21, %25, %26, %cst_2 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>, f16) outs(%arg1, %arg2, %arg3 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) { | |
^bb0(%arg4: f32): | |
iree_linalg_ext.yield %arg4 : f32 | |
} -> tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32> | |
scf.yield %27#0, %27#1, %27#2 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32> | |
} | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22#2, %22#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f32, %in_3: f32, %out: f16): | |
%25 = arith.divf %cst, %in : f32 | |
%26 = arith.mulf %25, %in_3 : f32 | |
%27 = arith.truncf %26 : f32 to f16 | |
linalg.yield %27 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23 : tensor<1x1x64x64xf16>) outs(%11 : tensor<1x64x1x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %24, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After DecomposeAttentionPass (iree-linalg-ext-decompose-attention) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 1.000000e+00 : f32 | |
%cst_0 = arith.constant -3.40282347E+38 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%cst_2 = arith.constant 1.250000e-01 : f16 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<1x1x64x64xf16> | |
%16 = tensor.empty() : tensor<1x1x64x64xf32> | |
%17 = tensor.empty() : tensor<1x1x64xf32> | |
%18 = linalg.fill ins(%cst_1 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32> | |
%19 = linalg.fill ins(%cst_0 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%20 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%12 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%22:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %18, %arg2 = %19, %arg3 = %20) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) { | |
%extracted_slice = tensor.extract_slice %13[0, 0, %arg0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x4096x64xf16> to tensor<1x1x64x64xf16> | |
%25 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%extracted_slice_3 = tensor.extract_slice %14[0, 0, 0, %arg0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x4096xf16> to tensor<1x1x64x64xf16> | |
%26 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%cst_4 = arith.constant 1.442380e+00 : f16 | |
%27 = arith.mulf %cst_2, %cst_4 : f16 | |
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%27 : f16) outs(%21 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
%42 = arith.mulf %in, %out : f16 | |
linalg.yield %42 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%29 = tensor.empty() : tensor<1x1x64x64xf32> | |
%cst_5 = arith.constant 0.000000e+00 : f32 | |
%30 = linalg.fill ins(%cst_5 : f32) outs(%29 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32> | |
%31 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>], iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel"]} ins(%28, %25 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%30 : tensor<1x1x64x64xf32>) attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} { | |
^bb0(%in: f16, %in_6: f16, %out: f32): | |
%42 = arith.extf %in : f16 to f32 | |
%43 = arith.extf %in_6 : f16 to f32 | |
%44 = arith.mulf %42, %43 : f32 | |
%45 = arith.addf %44, %out : f32 | |
linalg.yield %45 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%31 : tensor<1x1x64x64xf32>) { | |
^bb0(%out: f32): | |
linalg.yield %out : f32 | |
} -> tensor<1x1x64x64xf32> | |
%33 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%32 : tensor<1x1x64x64xf32>) outs(%arg2 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%42 = arith.maximumf %in, %out : f32 | |
linalg.yield %42 : f32 | |
} -> tensor<1x1x64xf32> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%33 : tensor<1x1x64xf32>) outs(%arg2 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%42 = arith.subf %out, %in : f32 | |
%43 = math.exp2 %42 : f32 | |
linalg.yield %43 : f32 | |
} -> tensor<1x1x64xf32> | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%34 : tensor<1x1x64xf32>) outs(%arg3 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%42 = arith.mulf %in, %out : f32 | |
linalg.yield %42 : f32 | |
} -> tensor<1x1x64xf32> | |
%36 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%33 : tensor<1x1x64xf32>) outs(%32 : tensor<1x1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%42 = arith.subf %out, %in : f32 | |
%43 = math.exp2 %42 : f32 | |
linalg.yield %43 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%36 : tensor<1x1x64x64xf32>) outs(%35 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%42 = arith.addf %in, %out : f32 | |
linalg.yield %42 : f32 | |
} -> tensor<1x1x64xf32> | |
%38 = tensor.empty() : tensor<1x1x64x64xf16> | |
%39 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%36 : tensor<1x1x64x64xf32>) outs(%38 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f32, %out: f16): | |
%42 = arith.truncf %in : f32 to f16 | |
linalg.yield %42 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%34 : tensor<1x1x64xf32>) outs(%arg1 : tensor<1x1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%42 = arith.mulf %in, %out : f32 | |
linalg.yield %42 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%41 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%39, %26 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%40 : tensor<1x1x64x64xf32>) attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} { | |
^bb0(%in: f16, %in_6: f16, %out: f32): | |
%42 = arith.extf %in : f16 to f32 | |
%43 = arith.extf %in_6 : f16 to f32 | |
%44 = arith.mulf %42, %43 : f32 | |
%45 = arith.addf %44, %out : f32 | |
linalg.yield %45 : f32 | |
} -> tensor<1x1x64x64xf32> | |
scf.yield %41, %33, %37 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32> | |
} | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22#2, %22#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f32, %in_3: f32, %out: f16): | |
%25 = arith.divf %cst, %in : f32 | |
%26 = arith.mulf %25, %in_3 : f32 | |
%27 = arith.truncf %26 : f32 to f16 | |
linalg.yield %27 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23 : tensor<1x1x64x64xf16>) outs(%11 : tensor<1x64x1x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %24, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
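Annotation on the decomposition above (not compiler output): DecomposeAttentionPass replaces the single iree_linalg_ext.online_attention op with explicit linalg ops for one online-softmax step, written in base 2. Q is pre-scaled by scale * log2(e) (%27/%28, using the f16 constant 1.442380e+00), the QK^T tile is accumulated in f32 (%31), the running row max is updated (%33), exp2 correction factors rescale the previous denominator and accumulator (%34, %35, %40), the unnormalized probabilities are exp2(S - new_max) (%36), summed into the running denominator (%37), truncated back to f16 for the MFMA_F32_16x16x16_F16 matmul (%39), and multiplied against the V tile (%41). A hedged NumPy transcription of that loop body; variable names are mine and layouts are simplified:

```python
import numpy as np

LOG2E_F16 = np.float16(np.log2(np.e))   # %cst_4 = arith.constant 1.442380e+00 : f16

def online_attention_step(q, k, v, scale, acc, row_max, row_sum):
    # q, k: 64x64 f16 tiles; v: 64x64 f16 tile in [head_dim, kv] layout;
    # scale: the f16 1.250000e-01 constant; acc/row_max/row_sum: f32 carried state.
    sq = (q * np.float16(scale * LOG2E_F16)).astype(np.float32)   # %27/%28: pre-scale Q
    s = sq @ k.T.astype(np.float32)                               # %31: QK^T, f32 accumulate
    new_max = np.maximum(row_max, s.max(axis=1))                  # %33: running row max
    corr = np.exp2(row_max - new_max)                             # %34: exp2(old_max - new_max)
    row_sum = row_sum * corr                                      # %35: rescale old denominator
    p = np.exp2(s - new_max[:, None])                             # %36: exp2(S - new_max)
    row_sum = row_sum + p.sum(axis=1)                             # %37: add this tile's sum
    p16 = p.astype(np.float16)                                    # %39: truncf to f16
    acc = acc * corr[:, None]                                     # %40: rescale old accumulator
    acc = acc + p16.astype(np.float32) @ v.T.astype(np.float32)   # %41: P @ V^T tile
    return acc, new_max, row_sum                                  # scf.yield %41, %33, %37
```

Because S is already multiplied by log2(e), exp2(S - new_max) equals exp of the unscaled scores minus their max, so the result matches the plain-exp formulation sketched earlier.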
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.802980e-01 : f16 | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 1.000000e+00 : f32 | |
%cst_1 = arith.constant -3.40282347E+38 : f32 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<1x1x64x64xf16> | |
%16 = tensor.empty() : tensor<1x1x64x64xf32> | |
%17 = tensor.empty() : tensor<1x1x64xf32> | |
%18 = linalg.fill ins(%cst_2 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32> | |
%19 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%20 = linalg.fill ins(%cst_2 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%12 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%22:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %18, %arg2 = %19, %arg3 = %20) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) { | |
%extracted_slice = tensor.extract_slice %13[0, 0, %arg0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x4096x64xf16> to tensor<1x1x64x64xf16> | |
%25 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%extracted_slice_3 = tensor.extract_slice %14[0, 0, 0, %arg0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x4096xf16> to tensor<1x1x64x64xf16> | |
%26 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst : f16) outs(%21 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
%40 = arith.mulf %in, %out : f16 | |
linalg.yield %40 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%28 = tensor.empty() : tensor<1x1x64x64xf32> | |
%29 = linalg.fill ins(%cst_2 : f32) outs(%28 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>], iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel"]} ins(%27, %25 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%29 : tensor<1x1x64x64xf32>) attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} { | |
^bb0(%in: f16, %in_4: f16, %out: f32): | |
%40 = arith.extf %in : f16 to f32 | |
%41 = arith.extf %in_4 : f16 to f32 | |
%42 = arith.mulf %40, %41 : f32 | |
%43 = arith.addf %42, %out : f32 | |
linalg.yield %43 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%31 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%30 : tensor<1x1x64x64xf32>) outs(%arg2 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%40 = arith.maximumf %in, %out : f32 | |
linalg.yield %40 : f32 | |
} -> tensor<1x1x64xf32> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%31 : tensor<1x1x64xf32>) outs(%arg2 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%40 = arith.subf %out, %in : f32 | |
%41 = math.exp2 %40 : f32 | |
linalg.yield %41 : f32 | |
} -> tensor<1x1x64xf32> | |
%33 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%32 : tensor<1x1x64xf32>) outs(%arg3 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%40 = arith.mulf %in, %out : f32 | |
linalg.yield %40 : f32 | |
} -> tensor<1x1x64xf32> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%31 : tensor<1x1x64xf32>) outs(%30 : tensor<1x1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%40 = arith.subf %out, %in : f32 | |
%41 = math.exp2 %40 : f32 | |
linalg.yield %41 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%34 : tensor<1x1x64x64xf32>) outs(%33 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%40 = arith.addf %in, %out : f32 | |
linalg.yield %40 : f32 | |
} -> tensor<1x1x64xf32> | |
%36 = tensor.empty() : tensor<1x1x64x64xf16> | |
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%34 : tensor<1x1x64x64xf32>) outs(%36 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f32, %out: f16): | |
%40 = arith.truncf %in : f32 to f16 | |
linalg.yield %40 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%32 : tensor<1x1x64xf32>) outs(%arg1 : tensor<1x1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%40 = arith.mulf %in, %out : f32 | |
linalg.yield %40 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%39 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%37, %26 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%38 : tensor<1x1x64x64xf32>) attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} { | |
^bb0(%in: f16, %in_4: f16, %out: f32): | |
%40 = arith.extf %in : f16 to f32 | |
%41 = arith.extf %in_4 : f16 to f32 | |
%42 = arith.mulf %40, %41 : f32 | |
%43 = arith.addf %42, %out : f32 | |
linalg.yield %43 : f32 | |
} -> tensor<1x1x64x64xf32> | |
scf.yield %39, %31, %35 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32> | |
} | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22#2, %22#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f32, %in_3: f32, %out: f16): | |
%25 = arith.divf %cst_0, %in : f32 | |
%26 = arith.mulf %25, %in_3 : f32 | |
%27 = arith.truncf %26 : f32 to f16 | |
linalg.yield %27 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23 : tensor<1x1x64x64xf16>) outs(%11 : tensor<1x64x1x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %24, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
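One visible effect of this canonicalization: the loop-invariant f16 multiply from the previous dump (the 1.250000e-01 attention scale times the log2(e) constant 1.442380e+00) has been constant-folded and hoisted into %cst = arith.constant 1.802980e-01 : f16 at the top of the function. A quick NumPy check of that f16 arithmetic (illustrative only):

```python
import numpy as np
# 0.125 (the attention scale) times log2(e) rounded to f16, re-rounded to f16:
folded = np.float16(np.float16(0.125) * np.float16(np.log2(np.e)))
print(folded)   # 0.1803 -- printed by MLIR as 1.802980e-01 : f16
```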
// -----// IR Dump After CSE (cse) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.802980e-01 : f16 | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 1.000000e+00 : f32 | |
%cst_1 = arith.constant -3.40282347E+38 : f32 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<1x1x64x64xf16> | |
%16 = tensor.empty() : tensor<1x1x64x64xf32> | |
%17 = tensor.empty() : tensor<1x1x64xf32> | |
%18 = linalg.fill ins(%cst_2 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32> | |
%19 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%20 = linalg.fill ins(%cst_2 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%12 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%22:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %18, %arg2 = %19, %arg3 = %20) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) { | |
%extracted_slice = tensor.extract_slice %13[0, 0, %arg0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x4096x64xf16> to tensor<1x1x64x64xf16> | |
%25 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%extracted_slice_3 = tensor.extract_slice %14[0, 0, 0, %arg0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x4096xf16> to tensor<1x1x64x64xf16> | |
%26 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst : f16) outs(%21 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
%37 = arith.mulf %in, %out : f16 | |
linalg.yield %37 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>], iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel"]} ins(%27, %25 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%18 : tensor<1x1x64x64xf32>) attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} { | |
^bb0(%in: f16, %in_4: f16, %out: f32): | |
%37 = arith.extf %in : f16 to f32 | |
%38 = arith.extf %in_4 : f16 to f32 | |
%39 = arith.mulf %37, %38 : f32 | |
%40 = arith.addf %39, %out : f32 | |
linalg.yield %40 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%28 : tensor<1x1x64x64xf32>) outs(%arg2 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%37 = arith.maximumf %in, %out : f32 | |
linalg.yield %37 : f32 | |
} -> tensor<1x1x64xf32> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%29 : tensor<1x1x64xf32>) outs(%arg2 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%37 = arith.subf %out, %in : f32 | |
%38 = math.exp2 %37 : f32 | |
linalg.yield %38 : f32 | |
} -> tensor<1x1x64xf32> | |
%31 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%30 : tensor<1x1x64xf32>) outs(%arg3 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%37 = arith.mulf %in, %out : f32 | |
linalg.yield %37 : f32 | |
} -> tensor<1x1x64xf32> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29 : tensor<1x1x64xf32>) outs(%28 : tensor<1x1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%37 = arith.subf %out, %in : f32 | |
%38 = math.exp2 %37 : f32 | |
linalg.yield %38 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%33 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%32 : tensor<1x1x64x64xf32>) outs(%31 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%37 = arith.addf %in, %out : f32 | |
linalg.yield %37 : f32 | |
} -> tensor<1x1x64xf32> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%32 : tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f32, %out: f16): | |
%37 = arith.truncf %in : f32 to f16 | |
linalg.yield %37 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%30 : tensor<1x1x64xf32>) outs(%arg1 : tensor<1x1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%37 = arith.mulf %in, %out : f32 | |
linalg.yield %37 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%36 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%34, %26 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%35 : tensor<1x1x64x64xf32>) attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} { | |
^bb0(%in: f16, %in_4: f16, %out: f32): | |
%37 = arith.extf %in : f16 to f32 | |
%38 = arith.extf %in_4 : f16 to f32 | |
%39 = arith.mulf %37, %38 : f32 | |
%40 = arith.addf %39, %out : f32 | |
linalg.yield %40 : f32 | |
} -> tensor<1x1x64x64xf32> | |
scf.yield %36, %29, %33 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32> | |
} | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22#2, %22#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f32, %in_3: f32, %out: f16): | |
%25 = arith.divf %cst_0, %in : f32 | |
%26 = arith.mulf %25, %in_3 : f32 | |
%27 = arith.truncf %26 : f32 to f16 | |
linalg.yield %27 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23 : tensor<1x1x64x64xf16>) outs(%11 : tensor<1x64x1x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %24, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After LLVMGPUConfigureTensorLayoutsPass (iree-llvmgpu-configure-tensor-layouts) //----- // | |
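// This pass assigns GPU vector layouts: the attention operands and accumulators are wrapped
// in iree_vector_ext.to_layout ops whose nested_layout attributes spell out the
// subgroup/batch/outer/thread/element tiling and strides chosen for the
// MFMA_F32_16x16x16_F16 intrinsic; operands that will be staged through shared memory are
// additionally tagged with shared_memory_conversion.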
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.802980e-01 : f16 | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 1.000000e+00 : f32 | |
%cst_1 = arith.constant -3.40282347E+38 : f32 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<1x1x64x64xf16> | |
%16 = tensor.empty() : tensor<1x1x64x64xf32> | |
%17 = tensor.empty() : tensor<1x1x64xf32> | |
%18 = linalg.fill ins(%cst_2 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32> | |
%19 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%20 = linalg.fill ins(%cst_2 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%12 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%22 = iree_vector_ext.to_layout %21 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 1], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 8], element_tile = [1, 1, 1, 8], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 8, 1]>) : tensor<1x1x64x64xf16> | |
%23:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %18, %arg2 = %19, %arg3 = %20) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) { | |
%extracted_slice = tensor.extract_slice %13[0, 0, %arg0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x4096x64xf16> to tensor<1x1x64x64xf16> | |
%26 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%27 = iree_vector_ext.to_layout %26 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 1], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 8], element_tile = [1, 1, 1, 8], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 8, 1]>) : tensor<1x1x64x64xf16> | |
%extracted_slice_3 = tensor.extract_slice %14[0, 0, 0, %arg0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x4096xf16> to tensor<1x1x64x64xf16> | |
%28 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%29 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 1], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 8], element_tile = [1, 1, 1, 8], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 8, 1]>) : tensor<1x1x64x64xf16> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst : f16) outs(%22 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
%48 = arith.mulf %in, %out : f16 | |
linalg.yield %48 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%31 = iree_vector_ext.to_layout %27 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<1x1x64x64xf16> | |
%32 = iree_vector_ext.to_layout %30 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<1x1x64x64xf16> | |
%33 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>], iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel"]} ins(%31, %32 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%33 : tensor<1x1x64x64xf32>) attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} { | |
^bb0(%in: f16, %in_4: f16, %out: f32): | |
%48 = arith.extf %in : f16 to f32 | |
%49 = arith.extf %in_4 : f16 to f32 | |
%50 = arith.mulf %48, %49 : f32 | |
%51 = arith.addf %50, %out : f32 | |
linalg.yield %51 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%35 = iree_vector_ext.to_layout %34 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32> | |
%36 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%35 : tensor<1x1x64x64xf32>) outs(%arg2 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%48 = arith.maximumf %in, %out : f32 | |
linalg.yield %48 : f32 | |
} -> tensor<1x1x64xf32> | |
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%36 : tensor<1x1x64xf32>) outs(%arg2 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%48 = arith.subf %out, %in : f32 | |
%49 = math.exp2 %48 : f32 | |
linalg.yield %49 : f32 | |
} -> tensor<1x1x64xf32> | |
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%37 : tensor<1x1x64xf32>) outs(%arg3 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%48 = arith.mulf %in, %out : f32 | |
linalg.yield %48 : f32 | |
} -> tensor<1x1x64xf32> | |
%39 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%36 : tensor<1x1x64xf32>) outs(%35 : tensor<1x1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%48 = arith.subf %out, %in : f32 | |
%49 = math.exp2 %48 : f32 | |
linalg.yield %49 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%39 : tensor<1x1x64x64xf32>) outs(%38 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%48 = arith.addf %in, %out : f32 | |
linalg.yield %48 : f32 | |
} -> tensor<1x1x64xf32> | |
%41 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%39 : tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f32, %out: f16): | |
%48 = arith.truncf %in : f32 to f16 | |
linalg.yield %48 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%37 : tensor<1x1x64xf32>) outs(%arg1 : tensor<1x1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%48 = arith.mulf %in, %out : f32 | |
linalg.yield %48 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%43 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<1x1x64x64xf16> | |
%44 = iree_vector_ext.to_layout %41 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf16> | |
%45 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32> | |
%46 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%43, %44 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%45 : tensor<1x1x64x64xf32>) attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} { | |
^bb0(%in: f16, %in_4: f16, %out: f32): | |
%48 = arith.extf %in : f16 to f32 | |
%49 = arith.extf %in_4 : f16 to f32 | |
%50 = arith.mulf %48, %49 : f32 | |
%51 = arith.addf %50, %out : f32 | |
linalg.yield %51 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%47 = iree_vector_ext.to_layout %46 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32> | |
scf.yield %47, %36, %40 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32> | |
} | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23#2, %23#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f32, %in_3: f32, %out: f16): | |
%26 = arith.divf %cst_0, %in : f32 | |
%27 = arith.mulf %26, %in_3 : f32 | |
%28 = arith.truncf %27 : f32 to f16 | |
linalg.yield %28 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%24 : tensor<1x1x64x64xf16>) outs(%11 : tensor<1x64x1x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %25, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After IREELoopInvariantCodeMotionPass (iree-loop-invariant-code-motion) //----- // | |
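// Loop-invariant code motion hoists the Q-side preprocessing out of the scf.for over the
// K/V sequence: the multiply of Q by the f16 scale constant and the to_layout annotations
// for the scaled Q and the zero-initialized f32 accumulator now appear once before the loop
// rather than being recomputed in every iteration.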
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.802980e-01 : f16 | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 1.000000e+00 : f32 | |
%cst_1 = arith.constant -3.40282347E+38 : f32 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<1x1x64x64xf16> | |
%16 = tensor.empty() : tensor<1x1x64x64xf32> | |
%17 = tensor.empty() : tensor<1x1x64xf32> | |
%18 = linalg.fill ins(%cst_2 : f32) outs(%16 : tensor<1x1x64x64xf32>) -> tensor<1x1x64x64xf32> | |
%19 = linalg.fill ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%20 = linalg.fill ins(%cst_2 : f32) outs(%17 : tensor<1x1x64xf32>) -> tensor<1x1x64xf32> | |
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%12 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%22 = iree_vector_ext.to_layout %21 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 1], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 8], element_tile = [1, 1, 1, 8], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 8, 1]>) : tensor<1x1x64x64xf16> | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst : f16) outs(%22 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
%29 = arith.mulf %in, %out : f16 | |
linalg.yield %29 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<1x1x64x64xf16> | |
%25 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32> | |
%26:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %18, %arg2 = %19, %arg3 = %20) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) { | |
%extracted_slice = tensor.extract_slice %13[0, 0, %arg0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x4096x64xf16> to tensor<1x1x64x64xf16> | |
%29 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%30 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 1], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 8], element_tile = [1, 1, 1, 8], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 8, 1]>) : tensor<1x1x64x64xf16> | |
%extracted_slice_3 = tensor.extract_slice %14[0, 0, 0, %arg0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x4096xf16> to tensor<1x1x64x64xf16> | |
%31 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) -> tensor<1x1x64x64xf16> | |
%32 = iree_vector_ext.to_layout %31 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 1], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 8], element_tile = [1, 1, 1, 8], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 8, 1]>) : tensor<1x1x64x64xf16> | |
%33 = iree_vector_ext.to_layout %30 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<1x1x64x64xf16> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>], iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel"]} ins(%33, %24 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%25 : tensor<1x1x64x64xf32>) attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} { | |
^bb0(%in: f16, %in_4: f16, %out: f32): | |
%48 = arith.extf %in : f16 to f32 | |
%49 = arith.extf %in_4 : f16 to f32 | |
%50 = arith.mulf %48, %49 : f32 | |
%51 = arith.addf %50, %out : f32 | |
linalg.yield %51 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%35 = iree_vector_ext.to_layout %34 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32> | |
%36 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%35 : tensor<1x1x64x64xf32>) outs(%arg2 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%48 = arith.maximumf %in, %out : f32 | |
linalg.yield %48 : f32 | |
} -> tensor<1x1x64xf32> | |
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%36 : tensor<1x1x64xf32>) outs(%arg2 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%48 = arith.subf %out, %in : f32 | |
%49 = math.exp2 %48 : f32 | |
linalg.yield %49 : f32 | |
} -> tensor<1x1x64xf32> | |
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%37 : tensor<1x1x64xf32>) outs(%arg3 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%48 = arith.mulf %in, %out : f32 | |
linalg.yield %48 : f32 | |
} -> tensor<1x1x64xf32> | |
%39 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%36 : tensor<1x1x64xf32>) outs(%35 : tensor<1x1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%48 = arith.subf %out, %in : f32 | |
%49 = math.exp2 %48 : f32 | |
linalg.yield %49 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%39 : tensor<1x1x64x64xf32>) outs(%38 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%48 = arith.addf %in, %out : f32 | |
linalg.yield %48 : f32 | |
} -> tensor<1x1x64xf32> | |
%41 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%39 : tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f32, %out: f16): | |
%48 = arith.truncf %in : f32 to f16 | |
linalg.yield %48 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%37 : tensor<1x1x64xf32>) outs(%arg1 : tensor<1x1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%48 = arith.mulf %in, %out : f32 | |
linalg.yield %48 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%43 = iree_vector_ext.to_layout %32 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<1x1x64x64xf16> | |
%44 = iree_vector_ext.to_layout %41 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf16> | |
%45 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32> | |
%46 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%43, %44 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%45 : tensor<1x1x64x64xf32>) attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} { | |
^bb0(%in: f16, %in_4: f16, %out: f32): | |
%48 = arith.extf %in : f16 to f32 | |
%49 = arith.extf %in_4 : f16 to f32 | |
%50 = arith.mulf %48, %49 : f32 | |
%51 = arith.addf %50, %out : f32 | |
linalg.yield %51 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%47 = iree_vector_ext.to_layout %46 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32> | |
scf.yield %47, %36, %40 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32> | |
} | |
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%26#2, %26#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f32, %in_3: f32, %out: f16): | |
%29 = arith.divf %cst_0, %in : f32 | |
%30 = arith.mulf %29, %in_3 : f32 | |
%31 = arith.truncf %30 : f32 to f16 | |
linalg.yield %31 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%27 : tensor<1x1x64x64xf16>) outs(%11 : tensor<1x64x1x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %28, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After LinalgGeneralizeNamedOpsPass (linalg-generalize-named-ops) //----- // | |
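// Generalization rewrites the remaining named ops (linalg.fill and linalg.copy) as
// equivalent linalg.generic ops with explicit indexing maps and yield-only bodies; the
// attention structure and the layout annotations are otherwise unchanged.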
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.802980e-01 : f16 | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 1.000000e+00 : f32 | |
%cst_1 = arith.constant -3.40282347E+38 : f32 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<1x1x64x64xf16> | |
%16 = tensor.empty() : tensor<1x1x64x64xf32> | |
%17 = tensor.empty() : tensor<1x1x64xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2 : f32) outs(%16 : tensor<1x1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1x64x64xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1x64xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_2 : f32) outs(%17 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1x64xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x1x64x64xf16> | |
%22 = iree_vector_ext.to_layout %21 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 1], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 8], element_tile = [1, 1, 1, 8], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 8, 1]>) : tensor<1x1x64x64xf16> | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst : f16) outs(%22 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
%29 = arith.mulf %in, %out : f16 | |
linalg.yield %29 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<1x1x64x64xf16> | |
%25 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32> | |
%26:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %18, %arg2 = %19, %arg3 = %20) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) { | |
%extracted_slice = tensor.extract_slice %13[0, 0, %arg0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x4096x64xf16> to tensor<1x1x64x64xf16> | |
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x1x64x64xf16> | |
%30 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 1], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 8], element_tile = [1, 1, 1, 8], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 8, 1]>) : tensor<1x1x64x64xf16> | |
%extracted_slice_3 = tensor.extract_slice %14[0, 0, 0, %arg0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x4096xf16> to tensor<1x1x64x64xf16> | |
%31 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_3 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x1x64x64xf16> | |
%32 = iree_vector_ext.to_layout %31 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 1], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 8], element_tile = [1, 1, 1, 8], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 8, 1]>) : tensor<1x1x64x64xf16> | |
%33 = iree_vector_ext.to_layout %30 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<1x1x64x64xf16> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>], iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel"]} ins(%33, %24 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%25 : tensor<1x1x64x64xf32>) attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} { | |
^bb0(%in: f16, %in_4: f16, %out: f32): | |
%48 = arith.extf %in : f16 to f32 | |
%49 = arith.extf %in_4 : f16 to f32 | |
%50 = arith.mulf %48, %49 : f32 | |
%51 = arith.addf %50, %out : f32 | |
linalg.yield %51 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%35 = iree_vector_ext.to_layout %34 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32> | |
%36 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%35 : tensor<1x1x64x64xf32>) outs(%arg2 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%48 = arith.maximumf %in, %out : f32 | |
linalg.yield %48 : f32 | |
} -> tensor<1x1x64xf32> | |
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%36 : tensor<1x1x64xf32>) outs(%arg2 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%48 = arith.subf %out, %in : f32 | |
%49 = math.exp2 %48 : f32 | |
linalg.yield %49 : f32 | |
} -> tensor<1x1x64xf32> | |
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%37 : tensor<1x1x64xf32>) outs(%arg3 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%48 = arith.mulf %in, %out : f32 | |
linalg.yield %48 : f32 | |
} -> tensor<1x1x64xf32> | |
%39 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%36 : tensor<1x1x64xf32>) outs(%35 : tensor<1x1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%48 = arith.subf %out, %in : f32 | |
%49 = math.exp2 %48 : f32 | |
linalg.yield %49 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%39 : tensor<1x1x64x64xf32>) outs(%38 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%48 = arith.addf %in, %out : f32 | |
linalg.yield %48 : f32 | |
} -> tensor<1x1x64xf32> | |
%41 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%39 : tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f32, %out: f16): | |
%48 = arith.truncf %in : f32 to f16 | |
linalg.yield %48 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%37 : tensor<1x1x64xf32>) outs(%arg1 : tensor<1x1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%48 = arith.mulf %in, %out : f32 | |
linalg.yield %48 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%43 = iree_vector_ext.to_layout %32 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 1, 1], batch_tile = [1, 1, 4, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 0, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<1x1x64x64xf16> | |
%44 = iree_vector_ext.to_layout %41 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf16> | |
%45 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32> | |
%46 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%43, %44 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%45 : tensor<1x1x64x64xf32>) attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} { | |
^bb0(%in: f16, %in_4: f16, %out: f32): | |
%48 = arith.extf %in : f16 to f32 | |
%49 = arith.extf %in_4 : f16 to f32 | |
%50 = arith.mulf %48, %49 : f32 | |
%51 = arith.addf %50, %out : f32 | |
linalg.yield %51 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%47 = iree_vector_ext.to_layout %46 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1, 2, 1], batch_tile = [1, 1, 2, 4], outer_tile = [1, 1, 1, 1], thread_tile = [1, 1, 16, 4], element_tile = [1, 1, 1, 4], subgroup_strides = [0, 0, 1, 0], thread_strides = [0, 0, 1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<1x1x64x64xf32> | |
scf.yield %47, %36, %40 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32> | |
} | |
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%26#2, %26#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f32, %in_3: f32, %out: f16): | |
%29 = arith.divf %cst_0, %in : f32 | |
%30 = arith.mulf %29, %in_3 : f32 | |
%31 = arith.truncf %30 : f32 to f16 | |
linalg.yield %31 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%27 : tensor<1x1x64x64xf16>) outs(%11 : tensor<1x64x1x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %28, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After VectorExtFoldUnitExtentDimsPass (iree-vector-ext-fold-unit-extent-dims) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.802980e-01 : f16 | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 1.000000e+00 : f32 | |
%cst_1 = arith.constant -3.40282347E+38 : f32 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<1x1x64x64xf16> | |
%16 = tensor.empty() : tensor<1x1x64x64xf32> | |
%17 = tensor.empty() : tensor<1x1x64xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_2 : f32) outs(%16 : tensor<1x1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1x64x64xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_1 : f32) outs(%17 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1x64xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_2 : f32) outs(%17 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1x64xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x1x64x64xf16> | |
%extracted_slice = tensor.extract_slice %21[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%22 = iree_vector_ext.to_layout %extracted_slice to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16> | |
%23 = tensor.empty() : tensor<1x1x64x64xf16> | |
%inserted_slice = tensor.insert_slice %22 into %23[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x1x64x64xf16> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst : f16) outs(%inserted_slice : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
%32 = arith.mulf %in, %out : f16 | |
linalg.yield %32 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%extracted_slice_3 = tensor.extract_slice %24[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%25 = iree_vector_ext.to_layout %extracted_slice_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16> | |
%26 = tensor.empty() : tensor<1x1x64x64xf16> | |
%inserted_slice_4 = tensor.insert_slice %25 into %26[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x1x64x64xf16> | |
%extracted_slice_5 = tensor.extract_slice %18[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf32> to tensor<64x64xf32> | |
%27 = iree_vector_ext.to_layout %extracted_slice_5 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%28 = tensor.empty() : tensor<1x1x64x64xf32> | |
%inserted_slice_6 = tensor.insert_slice %27 into %28[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf32> into tensor<1x1x64x64xf32> | |
%29:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %18, %arg2 = %19, %arg3 = %20) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) { | |
%extracted_slice_7 = tensor.extract_slice %13[0, 0, %arg0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x4096x64xf16> to tensor<1x1x64x64xf16> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_7 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x1x64x64xf16> | |
%extracted_slice_8 = tensor.extract_slice %32[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%33 = iree_vector_ext.to_layout %extracted_slice_8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16> | |
%extracted_slice_9 = tensor.extract_slice %14[0, 0, 0, %arg0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x4096xf16> to tensor<1x1x64x64xf16> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_9 : tensor<1x1x64x64xf16>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x1x64x64xf16> | |
%extracted_slice_10 = tensor.extract_slice %34[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%35 = iree_vector_ext.to_layout %extracted_slice_10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16> | |
%36 = iree_vector_ext.to_layout %33 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16> | |
%37 = tensor.empty() : tensor<1x1x64x64xf16> | |
%inserted_slice_11 = tensor.insert_slice %36 into %37[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x1x64x64xf16> | |
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>], iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel"]} ins(%inserted_slice_11, %inserted_slice_4 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%inserted_slice_6 : tensor<1x1x64x64xf32>) attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} { | |
^bb0(%in: f16, %in_21: f16, %out: f32): | |
%57 = arith.extf %in : f16 to f32 | |
%58 = arith.extf %in_21 : f16 to f32 | |
%59 = arith.mulf %57, %58 : f32 | |
%60 = arith.addf %59, %out : f32 | |
linalg.yield %60 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%extracted_slice_12 = tensor.extract_slice %38[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf32> to tensor<64x64xf32> | |
%39 = iree_vector_ext.to_layout %extracted_slice_12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%40 = tensor.empty() : tensor<1x1x64x64xf32> | |
%inserted_slice_13 = tensor.insert_slice %39 into %40[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf32> into tensor<1x1x64x64xf32> | |
%41 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%inserted_slice_13 : tensor<1x1x64x64xf32>) outs(%arg2 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%57 = arith.maximumf %in, %out : f32 | |
linalg.yield %57 : f32 | |
} -> tensor<1x1x64xf32> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%41 : tensor<1x1x64xf32>) outs(%arg2 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%57 = arith.subf %out, %in : f32 | |
%58 = math.exp2 %57 : f32 | |
linalg.yield %58 : f32 | |
} -> tensor<1x1x64xf32> | |
%43 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%42 : tensor<1x1x64xf32>) outs(%arg3 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%57 = arith.mulf %in, %out : f32 | |
linalg.yield %57 : f32 | |
} -> tensor<1x1x64xf32> | |
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%41 : tensor<1x1x64xf32>) outs(%inserted_slice_13 : tensor<1x1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%57 = arith.subf %out, %in : f32 | |
%58 = math.exp2 %57 : f32 | |
linalg.yield %58 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%45 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%44 : tensor<1x1x64x64xf32>) outs(%43 : tensor<1x1x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%57 = arith.addf %in, %out : f32 | |
linalg.yield %57 : f32 | |
} -> tensor<1x1x64xf32> | |
%46 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%44 : tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f32, %out: f16): | |
%57 = arith.truncf %in : f32 to f16 | |
linalg.yield %57 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%42 : tensor<1x1x64xf32>) outs(%arg1 : tensor<1x1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%57 = arith.mulf %in, %out : f32 | |
linalg.yield %57 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%48 = iree_vector_ext.to_layout %35 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16> | |
%49 = tensor.empty() : tensor<1x1x64x64xf16> | |
%inserted_slice_14 = tensor.insert_slice %48 into %49[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x1x64x64xf16> | |
%extracted_slice_15 = tensor.extract_slice %46[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%50 = iree_vector_ext.to_layout %extracted_slice_15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf16> | |
%51 = tensor.empty() : tensor<1x1x64x64xf16> | |
%inserted_slice_16 = tensor.insert_slice %50 into %51[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x1x64x64xf16> | |
%extracted_slice_17 = tensor.extract_slice %47[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf32> to tensor<64x64xf32> | |
%52 = iree_vector_ext.to_layout %extracted_slice_17 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%53 = tensor.empty() : tensor<1x1x64x64xf32> | |
%inserted_slice_18 = tensor.insert_slice %52 into %53[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf32> into tensor<1x1x64x64xf32> | |
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%inserted_slice_14, %inserted_slice_16 : tensor<1x1x64x64xf16>, tensor<1x1x64x64xf16>) outs(%inserted_slice_18 : tensor<1x1x64x64xf32>) attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} { | |
^bb0(%in: f16, %in_21: f16, %out: f32): | |
%57 = arith.extf %in : f16 to f32 | |
%58 = arith.extf %in_21 : f16 to f32 | |
%59 = arith.mulf %57, %58 : f32 | |
%60 = arith.addf %59, %out : f32 | |
linalg.yield %60 : f32 | |
} -> tensor<1x1x64x64xf32> | |
%extracted_slice_19 = tensor.extract_slice %54[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf32> to tensor<64x64xf32> | |
%55 = iree_vector_ext.to_layout %extracted_slice_19 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%56 = tensor.empty() : tensor<1x1x64x64xf32> | |
%inserted_slice_20 = tensor.insert_slice %55 into %56[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf32> into tensor<1x1x64x64xf32> | |
scf.yield %inserted_slice_20, %41, %45 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32> | |
} | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29#2, %29#0 : tensor<1x1x64xf32>, tensor<1x1x64x64xf32>) outs(%15 : tensor<1x1x64x64xf16>) { | |
^bb0(%in: f32, %in_7: f32, %out: f16): | |
%32 = arith.divf %cst_0, %in : f32 | |
%33 = arith.mulf %32, %in_7 : f32 | |
%34 = arith.truncf %33 : f32 to f16 | |
linalg.yield %34 : f16 | |
} -> tensor<1x1x64x64xf16> | |
%31 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%30 : tensor<1x1x64x64xf16>) outs(%11 : tensor<1x64x1x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %31, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After LinalgFoldUnitExtentDimsPass (linalg-fold-unit-extent-dims) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.802980e-01 : f16 | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 1.000000e+00 : f32 | |
%cst_1 = arith.constant -3.40282347E+38 : f32 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<1x1x64x64xf32> | |
%16 = tensor.empty() : tensor<1x1x64xf32> | |
%17 = tensor.empty() : tensor<64x64xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst_2 : f32) outs(%17 : tensor<64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64x64xf32> | |
%inserted_slice = tensor.insert_slice %18 into %15[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf32> into tensor<1x1x64x64xf32> | |
%19 = tensor.empty() : tensor<64xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_1 : f32) outs(%19 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64xf32> | |
%inserted_slice_3 = tensor.insert_slice %20 into %16[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32> | |
%21 = tensor.empty() : tensor<64xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_2 : f32) outs(%21 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64xf32> | |
%inserted_slice_4 = tensor.insert_slice %22 into %16[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32> | |
%extracted_slice = tensor.extract_slice %12[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%23 = tensor.empty() : tensor<64x64xf16> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice : tensor<64x64xf16>) outs(%23 : tensor<64x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<64x64xf16> | |
%25 = iree_vector_ext.to_layout %24 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16> | |
%26 = tensor.empty() : tensor<64x64xf16> | |
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst, %25 : f16, tensor<64x64xf16>) outs(%26 : tensor<64x64xf16>) { | |
^bb0(%in: f16, %in_9: f16, %out: f16): | |
%34 = arith.mulf %in, %in_9 : f16 | |
linalg.yield %34 : f16 | |
} -> tensor<64x64xf16> | |
%28 = iree_vector_ext.to_layout %27 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16> | |
%29 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%30:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %inserted_slice, %arg2 = %inserted_slice_3, %arg3 = %inserted_slice_4) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) { | |
%extracted_slice_9 = tensor.extract_slice %13[0, 0, %arg0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x4096x64xf16> to tensor<1x1x64x64xf16> | |
%extracted_slice_10 = tensor.extract_slice %extracted_slice_9[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%34 = tensor.empty() : tensor<64x64xf16> | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_10 : tensor<64x64xf16>) outs(%34 : tensor<64x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<64x64xf16> | |
%36 = iree_vector_ext.to_layout %35 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16> | |
%extracted_slice_11 = tensor.extract_slice %14[0, 0, 0, %arg0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x4096xf16> to tensor<1x1x64x64xf16> | |
%extracted_slice_12 = tensor.extract_slice %extracted_slice_11[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%37 = tensor.empty() : tensor<64x64xf16> | |
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_12 : tensor<64x64xf16>) outs(%37 : tensor<64x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<64x64xf16> | |
%39 = iree_vector_ext.to_layout %38 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16> | |
%40 = iree_vector_ext.to_layout %36 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16> | |
%41 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%40, %28 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%29 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_20: f16, %out: f32): | |
%61 = arith.extf %in : f16 to f32 | |
%62 = arith.extf %in_20 : f16 to f32 | |
%63 = arith.mulf %61, %62 : f32 | |
%64 = arith.addf %63, %out : f32 | |
linalg.yield %64 : f32 | |
} -> tensor<64x64xf32> | |
%42 = iree_vector_ext.to_layout %41 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%43 = tensor.empty() : tensor<64x64xf32> | |
%extracted_slice_13 = tensor.extract_slice %arg2[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x1x64xf32> to tensor<64xf32> | |
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%42 : tensor<64x64xf32>) outs(%extracted_slice_13 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%61 = arith.maximumf %in, %out : f32 | |
linalg.yield %61 : f32 | |
} -> tensor<64xf32> | |
%inserted_slice_14 = tensor.insert_slice %44 into %arg2[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32> | |
%extracted_slice_15 = tensor.extract_slice %arg2[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x1x64xf32> to tensor<64xf32> | |
%45 = tensor.empty() : tensor<64xf32> | |
%46 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%44, %extracted_slice_15 : tensor<64xf32>, tensor<64xf32>) outs(%45 : tensor<64xf32>) { | |
^bb0(%in: f32, %in_20: f32, %out: f32): | |
%61 = arith.subf %in_20, %in : f32 | |
%62 = math.exp2 %61 : f32 | |
linalg.yield %62 : f32 | |
} -> tensor<64xf32> | |
%extracted_slice_16 = tensor.extract_slice %arg3[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x1x64xf32> to tensor<64xf32> | |
%47 = tensor.empty() : tensor<64xf32> | |
%48 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%46, %extracted_slice_16 : tensor<64xf32>, tensor<64xf32>) outs(%47 : tensor<64xf32>) { | |
^bb0(%in: f32, %in_20: f32, %out: f32): | |
%61 = arith.mulf %in, %in_20 : f32 | |
linalg.yield %61 : f32 | |
} -> tensor<64xf32> | |
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%44, %42 : tensor<64xf32>, tensor<64x64xf32>) outs(%43 : tensor<64x64xf32>) { | |
^bb0(%in: f32, %in_20: f32, %out: f32): | |
%61 = arith.subf %in_20, %in : f32 | |
%62 = math.exp2 %61 : f32 | |
linalg.yield %62 : f32 | |
} -> tensor<64x64xf32> | |
%50 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%49 : tensor<64x64xf32>) outs(%48 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%61 = arith.addf %in, %out : f32 | |
linalg.yield %61 : f32 | |
} -> tensor<64xf32> | |
%inserted_slice_17 = tensor.insert_slice %50 into %arg3[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32> | |
%51 = tensor.empty() : tensor<64x64xf16> | |
%52 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%49 : tensor<64x64xf32>) outs(%51 : tensor<64x64xf16>) { | |
^bb0(%in: f32, %out: f16): | |
%61 = arith.truncf %in : f32 to f16 | |
linalg.yield %61 : f16 | |
} -> tensor<64x64xf16> | |
%extracted_slice_18 = tensor.extract_slice %arg1[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf32> to tensor<64x64xf32> | |
%53 = tensor.empty() : tensor<64x64xf32> | |
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%46, %extracted_slice_18 : tensor<64xf32>, tensor<64x64xf32>) outs(%53 : tensor<64x64xf32>) { | |
^bb0(%in: f32, %in_20: f32, %out: f32): | |
%61 = arith.mulf %in, %in_20 : f32 | |
linalg.yield %61 : f32 | |
} -> tensor<64x64xf32> | |
%55 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16> | |
%56 = iree_vector_ext.to_layout %52 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf16> | |
%57 = iree_vector_ext.to_layout %54 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%58 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%55, %56 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%57 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_20: f16, %out: f32): | |
%61 = arith.extf %in : f16 to f32 | |
%62 = arith.extf %in_20 : f16 to f32 | |
%63 = arith.mulf %61, %62 : f32 | |
%64 = arith.addf %63, %out : f32 | |
linalg.yield %64 : f32 | |
} -> tensor<64x64xf32> | |
%59 = iree_vector_ext.to_layout %58 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%60 = tensor.empty() : tensor<1x1x64x64xf32> | |
%inserted_slice_19 = tensor.insert_slice %59 into %60[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf32> into tensor<1x1x64x64xf32> | |
scf.yield %inserted_slice_19, %inserted_slice_14, %inserted_slice_17 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32> | |
} | |
%extracted_slice_5 = tensor.extract_slice %30#2[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x1x64xf32> to tensor<64xf32> | |
%extracted_slice_6 = tensor.extract_slice %30#0[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf32> to tensor<64x64xf32> | |
%31 = tensor.empty() : tensor<64x64xf16> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<64xf32>, tensor<64x64xf32>) outs(%31 : tensor<64x64xf16>) { | |
^bb0(%in: f32, %in_9: f32, %out: f16): | |
%34 = arith.divf %cst_0, %in : f32 | |
%35 = arith.mulf %34, %in_9 : f32 | |
%36 = arith.truncf %35 : f32 to f16 | |
linalg.yield %36 : f16 | |
} -> tensor<64x64xf16> | |
%extracted_slice_7 = tensor.extract_slice %11[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> to tensor<64x64xf16> | |
%33 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%32 : tensor<64x64xf16>) outs(%extracted_slice_7 : tensor<64x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<64x64xf16> | |
%inserted_slice_8 = tensor.insert_slice %33 into %11[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %inserted_slice_8, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.802980e-01 : f16 | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 1.000000e+00 : f32 | |
%cst_1 = arith.constant -3.40282347E+38 : f32 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<1x1x64x64xf32> | |
%16 = tensor.empty() : tensor<1x1x64xf32> | |
%17 = tensor.empty() : tensor<64x64xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst_2 : f32) outs(%17 : tensor<64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64x64xf32> | |
%inserted_slice = tensor.insert_slice %18 into %15[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf32> into tensor<1x1x64x64xf32> | |
%19 = tensor.empty() : tensor<64xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_1 : f32) outs(%19 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64xf32> | |
%inserted_slice_3 = tensor.insert_slice %20 into %16[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32> | |
%21 = tensor.empty() : tensor<64xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_2 : f32) outs(%21 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64xf32> | |
%inserted_slice_4 = tensor.insert_slice %22 into %16[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32> | |
%extracted_slice = tensor.extract_slice %12[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%23 = iree_vector_ext.to_layout %extracted_slice to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16> | |
%24 = tensor.empty() : tensor<64x64xf16> | |
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst, %23 : f16, tensor<64x64xf16>) outs(%24 : tensor<64x64xf16>) { | |
^bb0(%in: f16, %in_8: f16, %out: f16): | |
%31 = arith.mulf %in, %in_8 : f16 | |
linalg.yield %31 : f16 | |
} -> tensor<64x64xf16> | |
%26 = iree_vector_ext.to_layout %25 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16> | |
%27 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%28:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %inserted_slice, %arg2 = %inserted_slice_3, %arg3 = %inserted_slice_4) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) { | |
%extracted_slice_8 = tensor.extract_slice %13[0, 0, %arg0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x4096x64xf16> to tensor<1x1x64x64xf16> | |
%extracted_slice_9 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%31 = iree_vector_ext.to_layout %extracted_slice_9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16> | |
%extracted_slice_10 = tensor.extract_slice %14[0, 0, 0, %arg0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x4096xf16> to tensor<1x1x64x64xf16> | |
%extracted_slice_11 = tensor.extract_slice %extracted_slice_10[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%32 = iree_vector_ext.to_layout %extracted_slice_11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16> | |
%33 = iree_vector_ext.to_layout %31 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%33, %26 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%27 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_19: f16, %out: f32): | |
%54 = arith.extf %in : f16 to f32 | |
%55 = arith.extf %in_19 : f16 to f32 | |
%56 = arith.mulf %54, %55 : f32 | |
%57 = arith.addf %56, %out : f32 | |
linalg.yield %57 : f32 | |
} -> tensor<64x64xf32> | |
%35 = iree_vector_ext.to_layout %34 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%36 = tensor.empty() : tensor<64x64xf32> | |
%extracted_slice_12 = tensor.extract_slice %arg2[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x1x64xf32> to tensor<64xf32> | |
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%35 : tensor<64x64xf32>) outs(%extracted_slice_12 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%54 = arith.maximumf %in, %out : f32 | |
linalg.yield %54 : f32 | |
} -> tensor<64xf32> | |
%inserted_slice_13 = tensor.insert_slice %37 into %arg2[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32> | |
%extracted_slice_14 = tensor.extract_slice %arg2[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x1x64xf32> to tensor<64xf32> | |
%38 = tensor.empty() : tensor<64xf32> | |
%39 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%37, %extracted_slice_14 : tensor<64xf32>, tensor<64xf32>) outs(%38 : tensor<64xf32>) { | |
^bb0(%in: f32, %in_19: f32, %out: f32): | |
%54 = arith.subf %in_19, %in : f32 | |
%55 = math.exp2 %54 : f32 | |
linalg.yield %55 : f32 | |
} -> tensor<64xf32> | |
%extracted_slice_15 = tensor.extract_slice %arg3[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x1x64xf32> to tensor<64xf32> | |
%40 = tensor.empty() : tensor<64xf32> | |
%41 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%39, %extracted_slice_15 : tensor<64xf32>, tensor<64xf32>) outs(%40 : tensor<64xf32>) { | |
^bb0(%in: f32, %in_19: f32, %out: f32): | |
%54 = arith.mulf %in, %in_19 : f32 | |
linalg.yield %54 : f32 | |
} -> tensor<64xf32> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%37, %35 : tensor<64xf32>, tensor<64x64xf32>) outs(%36 : tensor<64x64xf32>) { | |
^bb0(%in: f32, %in_19: f32, %out: f32): | |
%54 = arith.subf %in_19, %in : f32 | |
%55 = math.exp2 %54 : f32 | |
linalg.yield %55 : f32 | |
} -> tensor<64x64xf32> | |
%43 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%42 : tensor<64x64xf32>) outs(%41 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%54 = arith.addf %in, %out : f32 | |
linalg.yield %54 : f32 | |
} -> tensor<64xf32> | |
%inserted_slice_16 = tensor.insert_slice %43 into %arg3[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32> | |
%44 = tensor.empty() : tensor<64x64xf16> | |
%45 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%42 : tensor<64x64xf32>) outs(%44 : tensor<64x64xf16>) { | |
^bb0(%in: f32, %out: f16): | |
%54 = arith.truncf %in : f32 to f16 | |
linalg.yield %54 : f16 | |
} -> tensor<64x64xf16> | |
%extracted_slice_17 = tensor.extract_slice %arg1[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf32> to tensor<64x64xf32> | |
%46 = tensor.empty() : tensor<64x64xf32> | |
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%39, %extracted_slice_17 : tensor<64xf32>, tensor<64x64xf32>) outs(%46 : tensor<64x64xf32>) { | |
^bb0(%in: f32, %in_19: f32, %out: f32): | |
%54 = arith.mulf %in, %in_19 : f32 | |
linalg.yield %54 : f32 | |
} -> tensor<64x64xf32> | |
%48 = iree_vector_ext.to_layout %32 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16> | |
%49 = iree_vector_ext.to_layout %45 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf16> | |
%50 = iree_vector_ext.to_layout %47 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%51 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%48, %49 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%50 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_19: f16, %out: f32): | |
%54 = arith.extf %in : f16 to f32 | |
%55 = arith.extf %in_19 : f16 to f32 | |
%56 = arith.mulf %54, %55 : f32 | |
%57 = arith.addf %56, %out : f32 | |
linalg.yield %57 : f32 | |
} -> tensor<64x64xf32> | |
%52 = iree_vector_ext.to_layout %51 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%53 = tensor.empty() : tensor<1x1x64x64xf32> | |
%inserted_slice_18 = tensor.insert_slice %52 into %53[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf32> into tensor<1x1x64x64xf32> | |
scf.yield %inserted_slice_18, %inserted_slice_13, %inserted_slice_16 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32> | |
} | |
%extracted_slice_5 = tensor.extract_slice %28#2[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x1x64xf32> to tensor<64xf32> | |
%extracted_slice_6 = tensor.extract_slice %28#0[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf32> to tensor<64x64xf32> | |
%29 = tensor.empty() : tensor<64x64xf16> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<64xf32>, tensor<64x64xf32>) outs(%29 : tensor<64x64xf16>) { | |
^bb0(%in: f32, %in_8: f32, %out: f16): | |
%31 = arith.divf %cst_0, %in : f32 | |
%32 = arith.mulf %31, %in_8 : f32 | |
%33 = arith.truncf %32 : f32 to f16 | |
linalg.yield %33 : f16 | |
} -> tensor<64x64xf16> | |
%inserted_slice_7 = tensor.insert_slice %30 into %11[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %inserted_slice_7, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- //
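// Note (annotation, not compiler output): compared with the preceding dump, CSE deduplicates the
// tensor.empty ops into one value per shape (%17 : tensor<64x64xf32>, %19 : tensor<64xf32>,
// %23 : tensor<64x64xf16>) and reuses those as the outs operands throughout the loop body,
// instead of materializing a fresh empty tensor before each linalg.generic.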
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.802980e-01 : f16 | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 1.000000e+00 : f32 | |
%cst_1 = arith.constant -3.40282347E+38 : f32 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<1x1x64x64xf32> | |
%16 = tensor.empty() : tensor<1x1x64xf32> | |
%17 = tensor.empty() : tensor<64x64xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst_2 : f32) outs(%17 : tensor<64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64x64xf32> | |
%inserted_slice = tensor.insert_slice %18 into %15[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf32> into tensor<1x1x64x64xf32> | |
%19 = tensor.empty() : tensor<64xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_1 : f32) outs(%19 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64xf32> | |
%inserted_slice_3 = tensor.insert_slice %20 into %16[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_2 : f32) outs(%19 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64xf32> | |
%inserted_slice_4 = tensor.insert_slice %21 into %16[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32> | |
%extracted_slice = tensor.extract_slice %12[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%22 = iree_vector_ext.to_layout %extracted_slice to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16> | |
%23 = tensor.empty() : tensor<64x64xf16> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst, %22 : f16, tensor<64x64xf16>) outs(%23 : tensor<64x64xf16>) { | |
^bb0(%in: f16, %in_8: f16, %out: f16): | |
%29 = arith.mulf %in, %in_8 : f16 | |
linalg.yield %29 : f16 | |
} -> tensor<64x64xf16> | |
%25 = iree_vector_ext.to_layout %24 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16> | |
%26 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%27:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %inserted_slice, %arg2 = %inserted_slice_3, %arg3 = %inserted_slice_4) -> (tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32>) { | |
%extracted_slice_8 = tensor.extract_slice %13[0, 0, %arg0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x4096x64xf16> to tensor<1x1x64x64xf16> | |
%extracted_slice_9 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%29 = iree_vector_ext.to_layout %extracted_slice_9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16> | |
%extracted_slice_10 = tensor.extract_slice %14[0, 0, 0, %arg0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x4096xf16> to tensor<1x1x64x64xf16> | |
%extracted_slice_11 = tensor.extract_slice %extracted_slice_10[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%30 = iree_vector_ext.to_layout %extracted_slice_11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16> | |
%31 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%31, %25 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%26 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_18: f16, %out: f32): | |
%46 = arith.extf %in : f16 to f32 | |
%47 = arith.extf %in_18 : f16 to f32 | |
%48 = arith.mulf %46, %47 : f32 | |
%49 = arith.addf %48, %out : f32 | |
linalg.yield %49 : f32 | |
} -> tensor<64x64xf32> | |
%33 = iree_vector_ext.to_layout %32 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%extracted_slice_12 = tensor.extract_slice %arg2[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x1x64xf32> to tensor<64xf32> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%33 : tensor<64x64xf32>) outs(%extracted_slice_12 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%46 = arith.maximumf %in, %out : f32 | |
linalg.yield %46 : f32 | |
} -> tensor<64xf32> | |
%inserted_slice_13 = tensor.insert_slice %34 into %arg2[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32> | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%34, %extracted_slice_12 : tensor<64xf32>, tensor<64xf32>) outs(%19 : tensor<64xf32>) { | |
^bb0(%in: f32, %in_18: f32, %out: f32): | |
%46 = arith.subf %in_18, %in : f32 | |
%47 = math.exp2 %46 : f32 | |
linalg.yield %47 : f32 | |
} -> tensor<64xf32> | |
%extracted_slice_14 = tensor.extract_slice %arg3[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x1x64xf32> to tensor<64xf32> | |
%36 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%35, %extracted_slice_14 : tensor<64xf32>, tensor<64xf32>) outs(%19 : tensor<64xf32>) { | |
^bb0(%in: f32, %in_18: f32, %out: f32): | |
%46 = arith.mulf %in, %in_18 : f32 | |
linalg.yield %46 : f32 | |
} -> tensor<64xf32> | |
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%34, %33 : tensor<64xf32>, tensor<64x64xf32>) outs(%17 : tensor<64x64xf32>) { | |
^bb0(%in: f32, %in_18: f32, %out: f32): | |
%46 = arith.subf %in_18, %in : f32 | |
%47 = math.exp2 %46 : f32 | |
linalg.yield %47 : f32 | |
} -> tensor<64x64xf32> | |
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%37 : tensor<64x64xf32>) outs(%36 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%46 = arith.addf %in, %out : f32 | |
linalg.yield %46 : f32 | |
} -> tensor<64xf32> | |
%inserted_slice_15 = tensor.insert_slice %38 into %arg3[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<64xf32> into tensor<1x1x64xf32> | |
%39 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%37 : tensor<64x64xf32>) outs(%23 : tensor<64x64xf16>) { | |
^bb0(%in: f32, %out: f16): | |
%46 = arith.truncf %in : f32 to f16 | |
linalg.yield %46 : f16 | |
} -> tensor<64x64xf16> | |
%extracted_slice_16 = tensor.extract_slice %arg1[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf32> to tensor<64x64xf32> | |
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35, %extracted_slice_16 : tensor<64xf32>, tensor<64x64xf32>) outs(%17 : tensor<64x64xf32>) { | |
^bb0(%in: f32, %in_18: f32, %out: f32): | |
%46 = arith.mulf %in, %in_18 : f32 | |
linalg.yield %46 : f32 | |
} -> tensor<64x64xf32> | |
%41 = iree_vector_ext.to_layout %30 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16> | |
%42 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf16> | |
%43 = iree_vector_ext.to_layout %40 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%41, %42 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%43 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_18: f16, %out: f32): | |
%46 = arith.extf %in : f16 to f32 | |
%47 = arith.extf %in_18 : f16 to f32 | |
%48 = arith.mulf %46, %47 : f32 | |
%49 = arith.addf %48, %out : f32 | |
linalg.yield %49 : f32 | |
} -> tensor<64x64xf32> | |
%45 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%inserted_slice_17 = tensor.insert_slice %45 into %15[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<64x64xf32> into tensor<1x1x64x64xf32> | |
scf.yield %inserted_slice_17, %inserted_slice_13, %inserted_slice_15 : tensor<1x1x64x64xf32>, tensor<1x1x64xf32>, tensor<1x1x64xf32> | |
} | |
%extracted_slice_5 = tensor.extract_slice %27#2[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x1x64xf32> to tensor<64xf32> | |
%extracted_slice_6 = tensor.extract_slice %27#0[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf32> to tensor<64x64xf32> | |
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<64xf32>, tensor<64x64xf32>) outs(%23 : tensor<64x64xf16>) { | |
^bb0(%in: f32, %in_8: f32, %out: f16): | |
%29 = arith.divf %cst_0, %in : f32 | |
%30 = arith.mulf %29, %in_8 : f32 | |
%31 = arith.truncf %30 : f32 to f16 | |
linalg.yield %31 : f16 | |
} -> tensor<64x64xf16> | |
%inserted_slice_7 = tensor.insert_slice %28 into %11[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %inserted_slice_7, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
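// Note (annotation, not compiler output): this pass folds away the tensor.insert_slice /
// tensor.extract_slice wrappers around the loop-carried values, so the scf.for below iterates
// directly on tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32> (the running max, running sum,
// and accumulator of the online-softmax attention loop) rather than on 1x1x... wrapped tensors,
// and only a single insert_slice remains before the final flow.dispatch.tensor.store.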
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.802980e-01 : f16 | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 1.000000e+00 : f32 | |
%cst_1 = arith.constant -3.40282347E+38 : f32 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<64x64xf32> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst_2 : f32) outs(%15 : tensor<64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64x64xf32> | |
%17 = tensor.empty() : tensor<64xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_1 : f32) outs(%17 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_2 : f32) outs(%17 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64xf32> | |
%extracted_slice = tensor.extract_slice %12[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%20 = iree_vector_ext.to_layout %extracted_slice to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16> | |
%21 = tensor.empty() : tensor<64x64xf16> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst, %20 : f16, tensor<64x64xf16>) outs(%21 : tensor<64x64xf16>) { | |
^bb0(%in: f16, %in_3: f16, %out: f16): | |
%27 = arith.mulf %in, %in_3 : f16 | |
linalg.yield %27 : f16 | |
} -> tensor<64x64xf16> | |
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16> | |
%24 = iree_vector_ext.to_layout %16 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%25:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %18, %arg2 = %19, %arg3 = %16) -> (tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %13[0, 0, %arg0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x4096x64xf16> to tensor<1x1x64x64xf16> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%27 = iree_vector_ext.to_layout %extracted_slice_4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16> | |
%extracted_slice_5 = tensor.extract_slice %14[0, 0, 0, %arg0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x4096xf16> to tensor<1x1x64x64xf16> | |
%extracted_slice_6 = tensor.extract_slice %extracted_slice_5[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%28 = iree_vector_ext.to_layout %extracted_slice_6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16> | |
%29 = iree_vector_ext.to_layout %27 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%29, %23 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%24 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_7: f16, %out: f32): | |
%44 = arith.extf %in : f16 to f32 | |
%45 = arith.extf %in_7 : f16 to f32 | |
%46 = arith.mulf %44, %45 : f32 | |
%47 = arith.addf %46, %out : f32 | |
linalg.yield %47 : f32 | |
} -> tensor<64x64xf32> | |
%31 = iree_vector_ext.to_layout %30 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%31 : tensor<64x64xf32>) outs(%arg1 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%44 = arith.maximumf %in, %out : f32 | |
linalg.yield %44 : f32 | |
} -> tensor<64xf32> | |
%33 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%32, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%17 : tensor<64xf32>) { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%44 = arith.subf %in_7, %in : f32 | |
%45 = math.exp2 %44 : f32 | |
linalg.yield %45 : f32 | |
} -> tensor<64xf32> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%33, %arg2 : tensor<64xf32>, tensor<64xf32>) outs(%17 : tensor<64xf32>) { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%44 = arith.mulf %in, %in_7 : f32 | |
linalg.yield %44 : f32 | |
} -> tensor<64xf32> | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%32, %31 : tensor<64xf32>, tensor<64x64xf32>) outs(%15 : tensor<64x64xf32>) { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%44 = arith.subf %in_7, %in : f32 | |
%45 = math.exp2 %44 : f32 | |
linalg.yield %45 : f32 | |
} -> tensor<64x64xf32> | |
%36 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%35 : tensor<64x64xf32>) outs(%34 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%44 = arith.addf %in, %out : f32 | |
linalg.yield %44 : f32 | |
} -> tensor<64xf32> | |
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35 : tensor<64x64xf32>) outs(%21 : tensor<64x64xf16>) { | |
^bb0(%in: f32, %out: f16): | |
%44 = arith.truncf %in : f32 to f16 | |
linalg.yield %44 : f16 | |
} -> tensor<64x64xf16> | |
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%33, %arg3 : tensor<64xf32>, tensor<64x64xf32>) outs(%15 : tensor<64x64xf32>) { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%44 = arith.mulf %in, %in_7 : f32 | |
linalg.yield %44 : f32 | |
} -> tensor<64x64xf32> | |
%39 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16> | |
%40 = iree_vector_ext.to_layout %37 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf16> | |
%41 = iree_vector_ext.to_layout %38 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%39, %40 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%41 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_7: f16, %out: f32): | |
%44 = arith.extf %in : f16 to f32 | |
%45 = arith.extf %in_7 : f16 to f32 | |
%46 = arith.mulf %44, %45 : f32 | |
%47 = arith.addf %46, %out : f32 | |
linalg.yield %47 : f32 | |
} -> tensor<64x64xf32> | |
%43 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
scf.yield %32, %36, %43 : tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32> | |
} | |
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%25#1, %25#2 : tensor<64xf32>, tensor<64x64xf32>) outs(%21 : tensor<64x64xf16>) { | |
^bb0(%in: f32, %in_3: f32, %out: f16): | |
%27 = arith.divf %cst_0, %in : f32 | |
%28 = arith.mulf %27, %in_3 : f32 | |
%29 = arith.truncf %28 : f32 to f16 | |
linalg.yield %29 : f16 | |
} -> tensor<64x64xf16> | |
%inserted_slice = tensor.insert_slice %26 into %11[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %inserted_slice, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After DecomposeConvolutionToLowerDimOpsPass (iree-codegen-decompose-convolution-to-lower-dim-ops) //----- //
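// Note (annotation, not compiler output): this attention dispatch contains no convolution ops,
// so the pass is a no-op here and the dump below is identical to the previous one.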
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.802980e-01 : f16 | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 1.000000e+00 : f32 | |
%cst_1 = arith.constant -3.40282347E+38 : f32 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<64x64xf32> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst_2 : f32) outs(%15 : tensor<64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64x64xf32> | |
%17 = tensor.empty() : tensor<64xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_1 : f32) outs(%17 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_2 : f32) outs(%17 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64xf32> | |
%extracted_slice = tensor.extract_slice %12[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%20 = iree_vector_ext.to_layout %extracted_slice to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16> | |
%21 = tensor.empty() : tensor<64x64xf16> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst, %20 : f16, tensor<64x64xf16>) outs(%21 : tensor<64x64xf16>) { | |
^bb0(%in: f16, %in_3: f16, %out: f16): | |
%27 = arith.mulf %in, %in_3 : f16 | |
linalg.yield %27 : f16 | |
} -> tensor<64x64xf16> | |
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16> | |
%24 = iree_vector_ext.to_layout %16 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%25:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %18, %arg2 = %19, %arg3 = %16) -> (tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %13[0, 0, %arg0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x4096x64xf16> to tensor<1x1x64x64xf16> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%27 = iree_vector_ext.to_layout %extracted_slice_4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16> | |
%extracted_slice_5 = tensor.extract_slice %14[0, 0, 0, %arg0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x4096xf16> to tensor<1x1x64x64xf16> | |
%extracted_slice_6 = tensor.extract_slice %extracted_slice_5[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%28 = iree_vector_ext.to_layout %extracted_slice_6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16> | |
%29 = iree_vector_ext.to_layout %27 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%29, %23 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%24 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_7: f16, %out: f32): | |
%44 = arith.extf %in : f16 to f32 | |
%45 = arith.extf %in_7 : f16 to f32 | |
%46 = arith.mulf %44, %45 : f32 | |
%47 = arith.addf %46, %out : f32 | |
linalg.yield %47 : f32 | |
} -> tensor<64x64xf32> | |
%31 = iree_vector_ext.to_layout %30 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%31 : tensor<64x64xf32>) outs(%arg1 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%44 = arith.maximumf %in, %out : f32 | |
linalg.yield %44 : f32 | |
} -> tensor<64xf32> | |
%33 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%32, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%17 : tensor<64xf32>) { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%44 = arith.subf %in_7, %in : f32 | |
%45 = math.exp2 %44 : f32 | |
linalg.yield %45 : f32 | |
} -> tensor<64xf32> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%33, %arg2 : tensor<64xf32>, tensor<64xf32>) outs(%17 : tensor<64xf32>) { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%44 = arith.mulf %in, %in_7 : f32 | |
linalg.yield %44 : f32 | |
} -> tensor<64xf32> | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%32, %31 : tensor<64xf32>, tensor<64x64xf32>) outs(%15 : tensor<64x64xf32>) { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%44 = arith.subf %in_7, %in : f32 | |
%45 = math.exp2 %44 : f32 | |
linalg.yield %45 : f32 | |
} -> tensor<64x64xf32> | |
%36 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%35 : tensor<64x64xf32>) outs(%34 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%44 = arith.addf %in, %out : f32 | |
linalg.yield %44 : f32 | |
} -> tensor<64xf32> | |
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35 : tensor<64x64xf32>) outs(%21 : tensor<64x64xf16>) { | |
^bb0(%in: f32, %out: f16): | |
%44 = arith.truncf %in : f32 to f16 | |
linalg.yield %44 : f16 | |
} -> tensor<64x64xf16> | |
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%33, %arg3 : tensor<64xf32>, tensor<64x64xf32>) outs(%15 : tensor<64x64xf32>) { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%44 = arith.mulf %in, %in_7 : f32 | |
linalg.yield %44 : f32 | |
} -> tensor<64x64xf32> | |
%39 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16> | |
%40 = iree_vector_ext.to_layout %37 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf16> | |
%41 = iree_vector_ext.to_layout %38 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%39, %40 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%41 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_7: f16, %out: f32): | |
%44 = arith.extf %in : f16 to f32 | |
%45 = arith.extf %in_7 : f16 to f32 | |
%46 = arith.mulf %44, %45 : f32 | |
%47 = arith.addf %46, %out : f32 | |
linalg.yield %47 : f32 | |
} -> tensor<64x64xf32> | |
%43 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
scf.yield %32, %36, %43 : tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32> | |
} | |
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%25#1, %25#2 : tensor<64xf32>, tensor<64x64xf32>) outs(%21 : tensor<64x64xf16>) { | |
^bb0(%in: f32, %in_3: f32, %out: f16): | |
%27 = arith.divf %cst_0, %in : f32 | |
%28 = arith.mulf %27, %in_3 : f32 | |
%29 = arith.truncf %28 : f32 to f16 | |
linalg.yield %29 : f16 | |
} -> tensor<64x64xf16> | |
%inserted_slice = tensor.insert_slice %26 into %11[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %inserted_slice, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After DecomposeIm2colPass (iree-linalg-ext-decompose-im2col) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 1.802980e-01 : f16 | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 1.000000e+00 : f32 | |
%cst_1 = arith.constant -3.40282347E+38 : f32 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<64x64xf32> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst_2 : f32) outs(%15 : tensor<64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64x64xf32> | |
%17 = tensor.empty() : tensor<64xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_1 : f32) outs(%17 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_2 : f32) outs(%17 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64xf32> | |
%extracted_slice = tensor.extract_slice %12[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%20 = iree_vector_ext.to_layout %extracted_slice to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16> | |
%21 = tensor.empty() : tensor<64x64xf16> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst, %20 : f16, tensor<64x64xf16>) outs(%21 : tensor<64x64xf16>) { | |
^bb0(%in: f16, %in_3: f16, %out: f16): | |
%27 = arith.mulf %in, %in_3 : f16 | |
linalg.yield %27 : f16 | |
} -> tensor<64x64xf16> | |
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16> | |
%24 = iree_vector_ext.to_layout %16 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%25:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %18, %arg2 = %19, %arg3 = %16) -> (tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32>) { | |
%extracted_slice_3 = tensor.extract_slice %13[0, 0, %arg0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x4096x64xf16> to tensor<1x1x64x64xf16> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%27 = iree_vector_ext.to_layout %extracted_slice_4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16> | |
%extracted_slice_5 = tensor.extract_slice %14[0, 0, 0, %arg0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x4096xf16> to tensor<1x1x64x64xf16> | |
%extracted_slice_6 = tensor.extract_slice %extracted_slice_5[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%28 = iree_vector_ext.to_layout %extracted_slice_6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<64x64xf16> | |
%29 = iree_vector_ext.to_layout %27 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%29, %23 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%24 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_7: f16, %out: f32): | |
%44 = arith.extf %in : f16 to f32 | |
%45 = arith.extf %in_7 : f16 to f32 | |
%46 = arith.mulf %44, %45 : f32 | |
%47 = arith.addf %46, %out : f32 | |
linalg.yield %47 : f32 | |
} -> tensor<64x64xf32> | |
%31 = iree_vector_ext.to_layout %30 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%31 : tensor<64x64xf32>) outs(%arg1 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%44 = arith.maximumf %in, %out : f32 | |
linalg.yield %44 : f32 | |
} -> tensor<64xf32> | |
%33 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%32, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%17 : tensor<64xf32>) { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%44 = arith.subf %in_7, %in : f32 | |
%45 = math.exp2 %44 : f32 | |
linalg.yield %45 : f32 | |
} -> tensor<64xf32> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%33, %arg2 : tensor<64xf32>, tensor<64xf32>) outs(%17 : tensor<64xf32>) { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%44 = arith.mulf %in, %in_7 : f32 | |
linalg.yield %44 : f32 | |
} -> tensor<64xf32> | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%32, %31 : tensor<64xf32>, tensor<64x64xf32>) outs(%15 : tensor<64x64xf32>) { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%44 = arith.subf %in_7, %in : f32 | |
%45 = math.exp2 %44 : f32 | |
linalg.yield %45 : f32 | |
} -> tensor<64x64xf32> | |
%36 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%35 : tensor<64x64xf32>) outs(%34 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%44 = arith.addf %in, %out : f32 | |
linalg.yield %44 : f32 | |
} -> tensor<64xf32> | |
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35 : tensor<64x64xf32>) outs(%21 : tensor<64x64xf16>) { | |
^bb0(%in: f32, %out: f16): | |
%44 = arith.truncf %in : f32 to f16 | |
linalg.yield %44 : f16 | |
} -> tensor<64x64xf16> | |
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%33, %arg3 : tensor<64xf32>, tensor<64x64xf32>) outs(%15 : tensor<64x64xf32>) { | |
^bb0(%in: f32, %in_7: f32, %out: f32): | |
%44 = arith.mulf %in, %in_7 : f32 | |
linalg.yield %44 : f32 | |
} -> tensor<64x64xf32> | |
%39 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x64xf16> | |
%40 = iree_vector_ext.to_layout %37 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf16> | |
%41 = iree_vector_ext.to_layout %38 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%39, %40 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%41 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_7: f16, %out: f32): | |
%44 = arith.extf %in : f16 to f32 | |
%45 = arith.extf %in_7 : f16 to f32 | |
%46 = arith.mulf %44, %45 : f32 | |
%47 = arith.addf %46, %out : f32 | |
linalg.yield %47 : f32 | |
} -> tensor<64x64xf32> | |
%43 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
scf.yield %32, %36, %43 : tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32> | |
} | |
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%25#1, %25#2 : tensor<64xf32>, tensor<64x64xf32>) outs(%21 : tensor<64x64xf16>) { | |
^bb0(%in: f32, %in_3: f32, %out: f16): | |
%27 = arith.divf %cst_0, %in : f32 | |
%28 = arith.mulf %27, %in_3 : f32 | |
%29 = arith.truncf %28 : f32 to f16 | |
linalg.yield %29 : f16 | |
} -> tensor<64x64xf16> | |
%inserted_slice = tensor.insert_slice %26 into %11[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %inserted_slice, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After VectorizeIREEVectorExtOpsPass (iree-vector-ext-vectorize-ops) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant 0.000000e+00 : f16 | |
%cst_0 = arith.constant 1.802980e-01 : f16 | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_1 = arith.constant 1.000000e+00 : f32 | |
%cst_2 = arith.constant -3.40282347E+38 : f32 | |
%cst_3 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<64x64xf32> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst_3 : f32) outs(%15 : tensor<64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64x64xf32> | |
%17 = tensor.empty() : tensor<64xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_2 : f32) outs(%17 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst_3 : f32) outs(%17 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64xf32> | |
%extracted_slice = tensor.extract_slice %12[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%20 = vector.transfer_read %extracted_slice[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16> | |
%21 = iree_vector_ext.to_layout %20 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%22 = tensor.empty() : tensor<64x64xf16> | |
%23 = vector.transfer_write %21, %22[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16> | |
%24 = tensor.empty() : tensor<64x64xf16> | |
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst_0, %23 : f16, tensor<64x64xf16>) outs(%24 : tensor<64x64xf16>) { | |
^bb0(%in: f16, %in_4: f16, %out: f16): | |
%36 = arith.mulf %in, %in_4 : f16 | |
linalg.yield %36 : f16 | |
} -> tensor<64x64xf16> | |
%26 = vector.transfer_read %25[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16> | |
%27 = iree_vector_ext.to_layout %26 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16> | |
%28 = tensor.empty() : tensor<64x64xf16> | |
%29 = vector.transfer_write %27, %28[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16> | |
%30 = vector.transfer_read %16[%c0, %c0], %cst_3 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32> | |
%31 = iree_vector_ext.to_layout %30 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%32 = tensor.empty() : tensor<64x64xf32> | |
%33 = vector.transfer_write %31, %32[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
%34:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %18, %arg2 = %19, %arg3 = %16) -> (tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32>) { | |
%extracted_slice_4 = tensor.extract_slice %13[0, 0, %arg0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x4096x64xf16> to tensor<1x1x64x64xf16> | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%36 = vector.transfer_read %extracted_slice_5[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16> | |
%37 = iree_vector_ext.to_layout %36 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%extracted_slice_6 = tensor.extract_slice %14[0, 0, 0, %arg0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x4096xf16> to tensor<1x1x64x64xf16> | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_6[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%38 = vector.transfer_read %extracted_slice_7[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16> | |
%39 = iree_vector_ext.to_layout %38 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%40 = iree_vector_ext.to_layout %37 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16> | |
%41 = tensor.empty() : tensor<64x64xf16> | |
%42 = vector.transfer_write %40, %41[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16> | |
%43 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%42, %29 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%33 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_8: f16, %out: f32): | |
%71 = arith.extf %in : f16 to f32 | |
%72 = arith.extf %in_8 : f16 to f32 | |
%73 = arith.mulf %71, %72 : f32 | |
%74 = arith.addf %73, %out : f32 | |
linalg.yield %74 : f32 | |
} -> tensor<64x64xf32> | |
%44 = vector.transfer_read %43[%c0, %c0], %cst_3 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32> | |
%45 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%46 = tensor.empty() : tensor<64x64xf32> | |
%47 = vector.transfer_write %45, %46[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
%48 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%47 : tensor<64x64xf32>) outs(%arg1 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%71 = arith.maximumf %in, %out : f32 | |
linalg.yield %71 : f32 | |
} -> tensor<64xf32> | |
%49 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%48, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%17 : tensor<64xf32>) { | |
^bb0(%in: f32, %in_8: f32, %out: f32): | |
%71 = arith.subf %in_8, %in : f32 | |
%72 = math.exp2 %71 : f32 | |
linalg.yield %72 : f32 | |
} -> tensor<64xf32> | |
%50 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%49, %arg2 : tensor<64xf32>, tensor<64xf32>) outs(%17 : tensor<64xf32>) { | |
^bb0(%in: f32, %in_8: f32, %out: f32): | |
%71 = arith.mulf %in, %in_8 : f32 | |
linalg.yield %71 : f32 | |
} -> tensor<64xf32> | |
%51 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%48, %47 : tensor<64xf32>, tensor<64x64xf32>) outs(%15 : tensor<64x64xf32>) { | |
^bb0(%in: f32, %in_8: f32, %out: f32): | |
%71 = arith.subf %in_8, %in : f32 | |
%72 = math.exp2 %71 : f32 | |
linalg.yield %72 : f32 | |
} -> tensor<64x64xf32> | |
%52 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%51 : tensor<64x64xf32>) outs(%50 : tensor<64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%71 = arith.addf %in, %out : f32 | |
linalg.yield %71 : f32 | |
} -> tensor<64xf32> | |
%53 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%51 : tensor<64x64xf32>) outs(%24 : tensor<64x64xf16>) { | |
^bb0(%in: f32, %out: f16): | |
%71 = arith.truncf %in : f32 to f16 | |
linalg.yield %71 : f16 | |
} -> tensor<64x64xf16> | |
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%49, %arg3 : tensor<64xf32>, tensor<64x64xf32>) outs(%15 : tensor<64x64xf32>) { | |
^bb0(%in: f32, %in_8: f32, %out: f32): | |
%71 = arith.mulf %in, %in_8 : f32 | |
linalg.yield %71 : f32 | |
} -> tensor<64x64xf32> | |
%55 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16> | |
%56 = tensor.empty() : tensor<64x64xf16> | |
%57 = vector.transfer_write %55, %56[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16> | |
%58 = vector.transfer_read %53[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16> | |
%59 = iree_vector_ext.to_layout %58 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%60 = tensor.empty() : tensor<64x64xf16> | |
%61 = vector.transfer_write %59, %60[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16> | |
%62 = vector.transfer_read %54[%c0, %c0], %cst_3 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32> | |
%63 = iree_vector_ext.to_layout %62 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%64 = tensor.empty() : tensor<64x64xf32> | |
%65 = vector.transfer_write %63, %64[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
%66 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%57, %61 : tensor<64x64xf16>, tensor<64x64xf16>) outs(%65 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_8: f16, %out: f32): | |
%71 = arith.extf %in : f16 to f32 | |
%72 = arith.extf %in_8 : f16 to f32 | |
%73 = arith.mulf %71, %72 : f32 | |
%74 = arith.addf %73, %out : f32 | |
linalg.yield %74 : f32 | |
} -> tensor<64x64xf32> | |
%67 = vector.transfer_read %66[%c0, %c0], %cst_3 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32> | |
%68 = iree_vector_ext.to_layout %67 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%69 = tensor.empty() : tensor<64x64xf32> | |
%70 = vector.transfer_write %68, %69[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
scf.yield %48, %52, %70 : tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32> | |
} | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%34#1, %34#2 : tensor<64xf32>, tensor<64x64xf32>) outs(%24 : tensor<64x64xf16>) { | |
^bb0(%in: f32, %in_4: f32, %out: f16): | |
%36 = arith.divf %cst_1, %in : f32 | |
%37 = arith.mulf %36, %in_4 : f32 | |
%38 = arith.truncf %37 : f32 to f16 | |
linalg.yield %38 : f16 | |
} -> tensor<64x64xf16> | |
%inserted_slice = tensor.insert_slice %35 into %11[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %inserted_slice, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_4 = arith.constant 0.000000e+00 : f16 | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_5 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<64x64xf32> | |
%16 = vector.transfer_write %cst_3, %15[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
%17 = tensor.empty() : tensor<64xf32> | |
%18 = vector.transfer_write %cst_2, %17[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32> | |
%19 = vector.transfer_write %cst_1, %17[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32> | |
%extracted_slice = tensor.extract_slice %12[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%20 = vector.transfer_read %extracted_slice[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16> | |
%21 = iree_vector_ext.to_layout %20 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%22 = tensor.empty() : tensor<64x64xf16> | |
%23 = arith.mulf %21, %cst_0 : vector<64x64xf16> | |
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16> | |
%25 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%26:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %18, %arg2 = %19, %arg3 = %16) -> (tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32>) { | |
%extracted_slice_6 = tensor.extract_slice %13[0, 0, %arg0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x4096x64xf16> to tensor<1x1x64x64xf16> | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_6[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%35 = vector.transfer_read %extracted_slice_7[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16> | |
%36 = iree_vector_ext.to_layout %35 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%extracted_slice_8 = tensor.extract_slice %14[0, 0, 0, %arg0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x4096xf16> to tensor<1x1x64x64xf16> | |
%extracted_slice_9 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%37 = vector.transfer_read %extracted_slice_9[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16> | |
%38 = iree_vector_ext.to_layout %37 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%39 = iree_vector_ext.to_layout %36 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16> | |
%40 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %39, %24, %25 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%41 = iree_vector_ext.to_layout %40 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%42 = vector.transfer_read %arg1[%c0], %cst_5 {in_bounds = [true]} : tensor<64xf32>, vector<64xf32> | |
%43 = vector.multi_reduction <maximumf>, %41, %42 [1] : vector<64x64xf32> to vector<64xf32> | |
%44 = vector.transfer_write %43, %arg1[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32> | |
%45 = vector.transfer_read %arg1[%c0], %cst_5 {in_bounds = [true]} : tensor<64xf32>, vector<64xf32> | |
%46 = arith.subf %45, %43 : vector<64xf32> | |
%47 = math.exp2 %46 : vector<64xf32> | |
%48 = vector.transfer_read %arg2[%c0], %cst_5 {in_bounds = [true]} : tensor<64xf32>, vector<64xf32> | |
%49 = arith.mulf %47, %48 : vector<64xf32> | |
%50 = vector.transfer_write %49, %17[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32> | |
%51 = vector.broadcast %43 : vector<64xf32> to vector<64x64xf32> | |
%52 = vector.transpose %51, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%53 = arith.subf %41, %52 : vector<64x64xf32> | |
%54 = math.exp2 %53 : vector<64x64xf32> | |
%55 = vector.multi_reduction <add>, %54, %49 [1] : vector<64x64xf32> to vector<64xf32> | |
%56 = vector.transfer_write %55, %50[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32> | |
%57 = arith.truncf %54 : vector<64x64xf32> to vector<64x64xf16> | |
%58 = vector.broadcast %47 : vector<64xf32> to vector<64x64xf32> | |
%59 = vector.transpose %58, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%60 = vector.transfer_read %arg3[%c0, %c0], %cst_5 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32> | |
%61 = arith.mulf %59, %60 : vector<64x64xf32> | |
%62 = iree_vector_ext.to_layout %38 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16> | |
%63 = iree_vector_ext.to_layout %57 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%64 = iree_vector_ext.to_layout %61 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%65 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %62, %63, %64 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%66 = iree_vector_ext.to_layout %65 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%67 = tensor.empty() : tensor<64x64xf32> | |
%68 = vector.transfer_write %66, %67[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
scf.yield %44, %56, %68 : tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32> | |
} | |
%27 = vector.transfer_read %26#1[%c0], %cst_5 {in_bounds = [true]} : tensor<64xf32>, vector<64xf32> | |
%28 = vector.broadcast %27 : vector<64xf32> to vector<64x64xf32> | |
%29 = vector.transfer_read %26#2[%c0, %c0], %cst_5 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32> | |
%30 = arith.divf %cst, %28 : vector<64x64xf32> | |
%31 = vector.transpose %30, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%32 = arith.mulf %31, %29 : vector<64x64xf32> | |
%33 = arith.truncf %32 : vector<64x64xf32> to vector<64x64xf16> | |
%34 = vector.transfer_write %33, %22[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16> | |
%inserted_slice = tensor.insert_slice %34 into %11[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %inserted_slice, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_4 = arith.constant 0.000000e+00 : f16 | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_5 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<64x64xf32> | |
%16 = vector.transfer_write %cst_3, %15[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
%17 = tensor.empty() : tensor<64xf32> | |
%18 = vector.transfer_write %cst_2, %17[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32> | |
%19 = vector.transfer_write %cst_1, %17[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32> | |
%extracted_slice = tensor.extract_slice %12[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%20 = vector.transfer_read %extracted_slice[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16> | |
%21 = iree_vector_ext.to_layout %20 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%22 = tensor.empty() : tensor<64x64xf16> | |
%23 = arith.mulf %21, %cst_0 : vector<64x64xf16> | |
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16> | |
%25 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%26:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %18, %arg2 = %19, %arg3 = %16) -> (tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32>) { | |
%extracted_slice_6 = tensor.extract_slice %13[0, 0, %arg0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x4096x64xf16> to tensor<1x1x64x64xf16> | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_6[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%35 = vector.transfer_read %extracted_slice_7[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16> | |
%36 = iree_vector_ext.to_layout %35 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%extracted_slice_8 = tensor.extract_slice %14[0, 0, 0, %arg0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x4096xf16> to tensor<1x1x64x64xf16> | |
%extracted_slice_9 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%37 = vector.transfer_read %extracted_slice_9[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16> | |
%38 = iree_vector_ext.to_layout %37 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%39 = iree_vector_ext.to_layout %36 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16> | |
%40 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %39, %24, %25 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%41 = iree_vector_ext.to_layout %40 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%42 = vector.transfer_read %arg1[%c0], %cst_5 {in_bounds = [true]} : tensor<64xf32>, vector<64xf32> | |
%43 = vector.multi_reduction <maximumf>, %41, %42 [1] : vector<64x64xf32> to vector<64xf32> | |
%44 = vector.transfer_write %43, %arg1[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32> | |
%45 = vector.transfer_read %arg1[%c0], %cst_5 {in_bounds = [true]} : tensor<64xf32>, vector<64xf32> | |
%46 = arith.subf %45, %43 : vector<64xf32> | |
%47 = math.exp2 %46 : vector<64xf32> | |
%48 = vector.transfer_read %arg2[%c0], %cst_5 {in_bounds = [true]} : tensor<64xf32>, vector<64xf32> | |
%49 = arith.mulf %47, %48 : vector<64xf32> | |
%50 = vector.broadcast %43 : vector<64xf32> to vector<64x64xf32> | |
%51 = vector.transpose %50, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%52 = arith.subf %41, %51 : vector<64x64xf32> | |
%53 = math.exp2 %52 : vector<64x64xf32> | |
%54 = vector.multi_reduction <add>, %53, %49 [1] : vector<64x64xf32> to vector<64xf32> | |
%55 = vector.transfer_write %54, %17[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32> | |
%56 = arith.truncf %53 : vector<64x64xf32> to vector<64x64xf16> | |
%57 = vector.broadcast %47 : vector<64xf32> to vector<64x64xf32> | |
%58 = vector.transpose %57, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%59 = vector.transfer_read %arg3[%c0, %c0], %cst_5 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32> | |
%60 = arith.mulf %58, %59 : vector<64x64xf32> | |
%61 = iree_vector_ext.to_layout %38 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16> | |
%62 = iree_vector_ext.to_layout %56 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%63 = iree_vector_ext.to_layout %60 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%64 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %61, %62, %63 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%65 = iree_vector_ext.to_layout %64 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%66 = tensor.empty() : tensor<64x64xf32> | |
%67 = vector.transfer_write %65, %66[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
scf.yield %44, %55, %67 : tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32> | |
} | |
%27 = vector.transfer_read %26#1[%c0], %cst_5 {in_bounds = [true]} : tensor<64xf32>, vector<64xf32> | |
%28 = vector.broadcast %27 : vector<64xf32> to vector<64x64xf32> | |
%29 = vector.transfer_read %26#2[%c0, %c0], %cst_5 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32> | |
%30 = arith.divf %cst, %28 : vector<64x64xf32> | |
%31 = vector.transpose %30, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%32 = arith.mulf %31, %29 : vector<64x64xf32> | |
%33 = arith.truncf %32 : vector<64x64xf32> to vector<64x64xf16> | |
%34 = vector.transfer_write %33, %22[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16> | |
%inserted_slice = tensor.insert_slice %34 into %11[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %inserted_slice, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_4 = arith.constant 0.000000e+00 : f16 | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_5 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = tensor.empty() : tensor<64x64xf32> | |
%16 = vector.transfer_write %cst_3, %15[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
%17 = tensor.empty() : tensor<64xf32> | |
%18 = vector.transfer_write %cst_2, %17[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32> | |
%19 = vector.transfer_write %cst_1, %17[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32> | |
%extracted_slice = tensor.extract_slice %12[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%20 = vector.transfer_read %extracted_slice[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16> | |
%21 = iree_vector_ext.to_layout %20 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%22 = tensor.empty() : tensor<64x64xf16> | |
%23 = arith.mulf %21, %cst_0 : vector<64x64xf16> | |
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16> | |
%25 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%26:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %18, %arg2 = %19, %arg3 = %16) -> (tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32>) { | |
%extracted_slice_6 = tensor.extract_slice %13[0, 0, %arg0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x4096x64xf16> to tensor<1x1x64x64xf16> | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_6[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%35 = vector.transfer_read %extracted_slice_7[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16> | |
%36 = iree_vector_ext.to_layout %35 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%extracted_slice_8 = tensor.extract_slice %14[0, 0, 0, %arg0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x4096xf16> to tensor<1x1x64x64xf16> | |
%extracted_slice_9 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xf16> to tensor<64x64xf16> | |
%37 = vector.transfer_read %extracted_slice_9[%c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x64xf16> | |
%38 = iree_vector_ext.to_layout %37 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%39 = iree_vector_ext.to_layout %36 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16> | |
%40 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %39, %24, %25 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%41 = iree_vector_ext.to_layout %40 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%42 = vector.transfer_read %arg1[%c0], %cst_5 {in_bounds = [true]} : tensor<64xf32>, vector<64xf32> | |
%43 = vector.multi_reduction <maximumf>, %41, %42 [1] : vector<64x64xf32> to vector<64xf32> | |
%44 = vector.transfer_write %43, %arg1[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32> | |
%45 = arith.subf %42, %43 : vector<64xf32> | |
%46 = math.exp2 %45 : vector<64xf32> | |
%47 = vector.transfer_read %arg2[%c0], %cst_5 {in_bounds = [true]} : tensor<64xf32>, vector<64xf32> | |
%48 = arith.mulf %46, %47 : vector<64xf32> | |
%49 = vector.broadcast %43 : vector<64xf32> to vector<64x64xf32> | |
%50 = vector.transpose %49, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%51 = arith.subf %41, %50 : vector<64x64xf32> | |
%52 = math.exp2 %51 : vector<64x64xf32> | |
%53 = vector.multi_reduction <add>, %52, %48 [1] : vector<64x64xf32> to vector<64xf32> | |
%54 = vector.transfer_write %53, %17[%c0] {in_bounds = [true]} : vector<64xf32>, tensor<64xf32> | |
%55 = arith.truncf %52 : vector<64x64xf32> to vector<64x64xf16> | |
%56 = vector.broadcast %46 : vector<64xf32> to vector<64x64xf32> | |
%57 = vector.transpose %56, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%58 = vector.transfer_read %arg3[%c0, %c0], %cst_5 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32> | |
%59 = arith.mulf %57, %58 : vector<64x64xf32> | |
%60 = iree_vector_ext.to_layout %38 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16> | |
%61 = iree_vector_ext.to_layout %55 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%62 = iree_vector_ext.to_layout %59 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%63 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %61, %62 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%64 = iree_vector_ext.to_layout %63 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%65 = vector.transfer_write %64, %15[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
scf.yield %44, %54, %65 : tensor<64xf32>, tensor<64xf32>, tensor<64x64xf32> | |
} | |
%27 = vector.transfer_read %26#1[%c0], %cst_5 {in_bounds = [true]} : tensor<64xf32>, vector<64xf32> | |
%28 = vector.broadcast %27 : vector<64xf32> to vector<64x64xf32> | |
%29 = vector.transfer_read %26#2[%c0, %c0], %cst_5 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32> | |
%30 = arith.divf %cst, %28 : vector<64x64xf32> | |
%31 = vector.transpose %30, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%32 = arith.mulf %31, %29 : vector<64x64xf32> | |
%33 = arith.truncf %32 : vector<64x64xf32> to vector<64x64xf16> | |
%34 = vector.transfer_write %33, %22[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16> | |
%inserted_slice = tensor.insert_slice %34 into %11[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %inserted_slice, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_4 = arith.constant 0.000000e+00 : f16 | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = vector.transfer_read %12[%c0, %c0, %c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<1x1x64x64xf16>, vector<64x64xf16> | |
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%17 = tensor.empty() : tensor<64x64xf16> | |
%18 = arith.mulf %16, %cst_0 : vector<64x64xf16> | |
%19 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16> | |
%20 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%21:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %cst_2, %arg2 = %cst_1, %arg3 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) { | |
%28 = vector.transfer_read %13[%c0, %c0, %arg0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<1x1x4096x64xf16>, vector<64x64xf16> | |
%29 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%30 = vector.transfer_read %14[%c0, %c0, %c0, %arg0], %cst_4 {in_bounds = [true, true]} : tensor<1x1x64x4096xf16>, vector<64x64xf16> | |
%31 = iree_vector_ext.to_layout %30 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%32 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16> | |
%33 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %32, %19, %20 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%34 = iree_vector_ext.to_layout %33 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%35 = vector.multi_reduction <maximumf>, %34, %arg1 [1] : vector<64x64xf32> to vector<64xf32> | |
%36 = arith.subf %arg1, %35 : vector<64xf32> | |
%37 = math.exp2 %36 : vector<64xf32> | |
%38 = arith.mulf %37, %arg2 : vector<64xf32> | |
%39 = vector.broadcast %35 : vector<64xf32> to vector<64x64xf32> | |
%40 = vector.transpose %39, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%41 = arith.subf %34, %40 : vector<64x64xf32> | |
%42 = math.exp2 %41 : vector<64x64xf32> | |
%43 = vector.multi_reduction <add>, %42, %38 [1] : vector<64x64xf32> to vector<64xf32> | |
%44 = arith.truncf %42 : vector<64x64xf32> to vector<64x64xf16> | |
%45 = vector.broadcast %37 : vector<64xf32> to vector<64x64xf32> | |
%46 = vector.transpose %45, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%47 = arith.mulf %46, %arg3 : vector<64x64xf32> | |
%48 = iree_vector_ext.to_layout %31 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16> | |
%49 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%50 = iree_vector_ext.to_layout %47 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%51 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %49, %50 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%52 = iree_vector_ext.to_layout %51 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %35, %43, %52 : vector<64xf32>, vector<64xf32>, vector<64x64xf32> | |
} | |
%22 = vector.broadcast %21#1 : vector<64xf32> to vector<64x64xf32> | |
%23 = arith.divf %cst, %22 : vector<64x64xf32> | |
%24 = vector.transpose %23, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%25 = arith.mulf %24, %21#2 : vector<64x64xf32> | |
%26 = arith.truncf %25 : vector<64x64xf32> to vector<64x64xf16> | |
%27 = vector.transfer_write %26, %17[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16> | |
%inserted_slice = tensor.insert_slice %27 into %11[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %inserted_slice, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_4 = arith.constant 0.000000e+00 : f16 | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = vector.transfer_read %12[%c0, %c0, %c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<1x1x64x64xf16>, vector<64x64xf16> | |
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%17 = tensor.empty() : tensor<64x64xf16> | |
%18 = arith.mulf %16, %cst_0 : vector<64x64xf16> | |
%19 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16> | |
%20 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%21:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %cst_2, %arg2 = %cst_1, %arg3 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) { | |
%28 = vector.transfer_read %13[%c0, %c0, %arg0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<1x1x4096x64xf16>, vector<64x64xf16> | |
%29 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%30 = vector.transfer_read %14[%c0, %c0, %c0, %arg0], %cst_4 {in_bounds = [true, true]} : tensor<1x1x64x4096xf16>, vector<64x64xf16> | |
%31 = iree_vector_ext.to_layout %30 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%32 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16> | |
%33 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %32, %19, %20 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%34 = iree_vector_ext.to_layout %33 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%35 = vector.multi_reduction <maximumf>, %34, %arg1 [1] : vector<64x64xf32> to vector<64xf32> | |
%36 = arith.subf %arg1, %35 : vector<64xf32> | |
%37 = math.exp2 %36 : vector<64xf32> | |
%38 = arith.mulf %37, %arg2 : vector<64xf32> | |
%39 = vector.broadcast %35 : vector<64xf32> to vector<64x64xf32> | |
%40 = vector.transpose %39, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%41 = arith.subf %34, %40 : vector<64x64xf32> | |
%42 = math.exp2 %41 : vector<64x64xf32> | |
%43 = vector.multi_reduction <add>, %42, %38 [1] : vector<64x64xf32> to vector<64xf32> | |
%44 = arith.truncf %42 : vector<64x64xf32> to vector<64x64xf16> | |
%45 = vector.broadcast %37 : vector<64xf32> to vector<64x64xf32> | |
%46 = vector.transpose %45, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%47 = arith.mulf %46, %arg3 : vector<64x64xf32> | |
%48 = iree_vector_ext.to_layout %31 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16> | |
%49 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%50 = iree_vector_ext.to_layout %47 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%51 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %49, %50 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%52 = iree_vector_ext.to_layout %51 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %35, %43, %52 : vector<64xf32>, vector<64xf32>, vector<64x64xf32> | |
} | |
%22 = vector.broadcast %21#1 : vector<64xf32> to vector<64x64xf32> | |
%23 = arith.divf %cst, %22 : vector<64x64xf32> | |
%24 = vector.transpose %23, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%25 = arith.mulf %24, %21#2 : vector<64x64xf32> | |
%26 = arith.truncf %25 : vector<64x64xf32> to vector<64x64xf16> | |
%27 = vector.transfer_write %26, %17[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16> | |
%inserted_slice = tensor.insert_slice %27 into %11[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %inserted_slice, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_4 = arith.constant 0.000000e+00 : f16 | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = vector.transfer_read %12[%c0, %c0, %c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<1x1x64x64xf16>, vector<64x64xf16> | |
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%17 = tensor.empty() : tensor<64x64xf16> | |
%18 = arith.mulf %16, %cst_0 : vector<64x64xf16> | |
%19 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16> | |
%20 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%21:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %cst_2, %arg2 = %cst_1, %arg3 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) { | |
%28 = vector.transfer_read %13[%c0, %c0, %arg0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<1x1x4096x64xf16>, vector<64x64xf16> | |
%29 = iree_vector_ext.to_layout %28 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%30 = vector.transfer_read %14[%c0, %c0, %c0, %arg0], %cst_4 {in_bounds = [true, true]} : tensor<1x1x64x4096xf16>, vector<64x64xf16> | |
%31 = iree_vector_ext.to_layout %30 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%32 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16> | |
%33 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %32, %19, %20 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%34 = iree_vector_ext.to_layout %33 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%35 = vector.multi_reduction <maximumf>, %34, %arg1 [1] : vector<64x64xf32> to vector<64xf32> | |
%36 = arith.subf %arg1, %35 : vector<64xf32> | |
%37 = math.exp2 %36 : vector<64xf32> | |
%38 = arith.mulf %37, %arg2 : vector<64xf32> | |
%39 = vector.broadcast %35 : vector<64xf32> to vector<64x64xf32> | |
%40 = vector.transpose %39, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%41 = arith.subf %34, %40 : vector<64x64xf32> | |
%42 = math.exp2 %41 : vector<64x64xf32> | |
%43 = vector.multi_reduction <add>, %42, %38 [1] : vector<64x64xf32> to vector<64xf32> | |
%44 = arith.truncf %42 : vector<64x64xf32> to vector<64x64xf16> | |
%45 = vector.broadcast %37 : vector<64xf32> to vector<64x64xf32> | |
%46 = vector.transpose %45, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%47 = arith.mulf %46, %arg3 : vector<64x64xf32> | |
%48 = iree_vector_ext.to_layout %31 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x64xf16> | |
%49 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%50 = iree_vector_ext.to_layout %47 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%51 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %49, %50 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%52 = iree_vector_ext.to_layout %51 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %35, %43, %52 : vector<64xf32>, vector<64xf32>, vector<64x64xf32> | |
} | |
%22 = vector.broadcast %21#1 : vector<64xf32> to vector<64x64xf32> | |
%23 = arith.divf %cst, %22 : vector<64x64xf32> | |
%24 = vector.transpose %23, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%25 = arith.mulf %24, %21#2 : vector<64x64xf32> | |
%26 = arith.truncf %25 : vector<64x64xf32> to vector<64x64xf16> | |
%27 = vector.transfer_write %26, %17[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16> | |
%inserted_slice = tensor.insert_slice %27 into %11[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %inserted_slice, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After GPUVectorAllocPass (iree-codegen-gpu-vector-alloc) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
gpu.barrier | |
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_4 = arith.constant 0.000000e+00 : f16 | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = vector.transfer_read %12[%c0, %c0, %c0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<1x1x64x64xf16>, vector<64x64xf16> | |
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%17 = tensor.empty() : tensor<64x64xf16> | |
%18 = arith.mulf %16, %cst_0 : vector<64x64xf16> | |
%19 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%c0_5 = arith.constant 0 : index | |
%20 = vector.transfer_write %18, %19[%c0_5, %c0_5] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%21 = iree_gpu.value_barrier %20 : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%c0_6 = arith.constant 0 : index | |
%cst_7 = arith.constant 0.000000e+00 : f16 | |
%22 = vector.transfer_read %21[%c0_6, %c0_6], %cst_7 {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%24 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%25:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %cst_2, %arg2 = %cst_1, %arg3 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) { | |
gpu.barrier | |
gpu.barrier | |
%32 = vector.transfer_read %13[%c0, %c0, %arg0, %c0], %cst_4 {in_bounds = [true, true]} : tensor<1x1x4096x64xf16>, vector<64x64xf16> | |
%33 = iree_vector_ext.to_layout %32 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%34 = vector.transfer_read %14[%c0, %c0, %c0, %arg0], %cst_4 {in_bounds = [true, true]} : tensor<1x1x64x4096xf16>, vector<64x64xf16> | |
%35 = iree_vector_ext.to_layout %34 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%36 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%c0_8 = arith.constant 0 : index | |
%37 = vector.transfer_write %33, %36[%c0_8, %c0_8] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%38 = iree_gpu.value_barrier %37 : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%c0_9 = arith.constant 0 : index | |
%cst_10 = arith.constant 0.000000e+00 : f16 | |
%39 = vector.transfer_read %38[%c0_9, %c0_9], %cst_10 {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%40 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%41 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %40, %23, %24 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%42 = iree_vector_ext.to_layout %41 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%43 = vector.multi_reduction <maximumf>, %42, %arg1 [1] : vector<64x64xf32> to vector<64xf32> | |
%44 = arith.subf %arg1, %43 : vector<64xf32> | |
%45 = math.exp2 %44 : vector<64xf32> | |
%46 = arith.mulf %45, %arg2 : vector<64xf32> | |
%47 = vector.broadcast %43 : vector<64xf32> to vector<64x64xf32> | |
%48 = vector.transpose %47, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%49 = arith.subf %42, %48 : vector<64x64xf32> | |
%50 = math.exp2 %49 : vector<64x64xf32> | |
%51 = vector.multi_reduction <add>, %50, %46 [1] : vector<64x64xf32> to vector<64xf32> | |
%52 = arith.truncf %50 : vector<64x64xf32> to vector<64x64xf16> | |
%53 = vector.broadcast %45 : vector<64xf32> to vector<64x64xf32> | |
%54 = vector.transpose %53, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%55 = arith.mulf %54, %arg3 : vector<64x64xf32> | |
%56 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%c0_11 = arith.constant 0 : index | |
%57 = vector.transfer_write %35, %56[%c0_11, %c0_11] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%58 = iree_gpu.value_barrier %57 : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%c0_12 = arith.constant 0 : index | |
%cst_13 = arith.constant 0.000000e+00 : f16 | |
%59 = vector.transfer_read %58[%c0_12, %c0_12], %cst_13 {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%60 = iree_vector_ext.to_layout %59 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%61 = iree_vector_ext.to_layout %52 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%62 = iree_vector_ext.to_layout %55 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%63 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %61, %62 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%64 = iree_vector_ext.to_layout %63 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %43, %51, %64 : vector<64xf32>, vector<64xf32>, vector<64x64xf32> | |
} | |
%26 = vector.broadcast %25#1 : vector<64xf32> to vector<64x64xf32> | |
%27 = arith.divf %cst, %26 : vector<64x64xf32> | |
%28 = vector.transpose %27, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%29 = arith.mulf %28, %25#2 : vector<64x64xf32> | |
%30 = arith.truncf %29 : vector<64x64xf32> to vector<64x64xf16> | |
%31 = vector.transfer_write %30, %17[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16> | |
%inserted_slice = tensor.insert_slice %31 into %11[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %inserted_slice, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
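// -----// Note: online-softmax structure of the attention loop above (sketch, not compiler output) //----- //
// The scf.for over %c0 to %c4096 step %c64 streams the 4096-long K/V sequence in 64-wide tiles and
// carries (running row-max m, running row-sum l, output accumulator acc) as iter_args. Writing S_i for
// the 64x64 score tile of iteration i, the loop body corresponds to the usual flash-attention recurrence,
// kept here in the same exp2 form the IR uses:
//   m_i      = \max(m_{i-1}, \operatorname{rowmax}(S_i))
//   \alpha_i = 2^{m_{i-1} - m_i}
//   P_i      = 2^{S_i - m_i}
//   l_i      = \alpha_i \, l_{i-1} + \operatorname{rowsum}(P_i)
//   acc_i    = \alpha_i \, acc_{i-1} + P_i V_i
// and the epilogue after the loop normalizes, out = acc_N / l_N, before truncating to f16.
// The f16 splat dense<1.802980e-01> applied to Q is consistent with the attention scale
// 1/\sqrt{64} = 0.125 folded with \log_2 e \approx 1.442695 (0.125 * 1.442695 \approx 0.18034, which
// rounds to 0.180298 in f16); folding the \log_2 e factor is what allows math.exp2 in place of exp.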
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c64 = arith.constant 64 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_1 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_2 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_3 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_4 = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
gpu.barrier | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = vector.transfer_read %12[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x64x64xf16>, vector<64x64xf16> | |
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%17 = tensor.empty() : tensor<64x64xf16> | |
%18 = arith.mulf %16, %cst_3 : vector<64x64xf16> | |
%19 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%20 = vector.transfer_write %18, %19[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%21 = iree_gpu.value_barrier %20 : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%22 = vector.transfer_read %21[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%24 = iree_vector_ext.to_layout %cst_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%25:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %cst_1, %arg2 = %cst_2, %arg3 = %cst_0) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) { | |
gpu.barrier | |
%32 = vector.transfer_read %13[%c0, %c0, %arg0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x4096x64xf16>, vector<64x64xf16> | |
%33 = iree_vector_ext.to_layout %32 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%34 = vector.transfer_read %14[%c0, %c0, %c0, %arg0], %cst {in_bounds = [true, true]} : tensor<1x1x64x4096xf16>, vector<64x64xf16> | |
%35 = iree_vector_ext.to_layout %34 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%36 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%37 = vector.transfer_write %33, %36[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%38 = iree_gpu.value_barrier %37 : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%39 = vector.transfer_read %38[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%40 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%41 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %40, %23, %24 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%42 = iree_vector_ext.to_layout %41 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%43 = vector.multi_reduction <maximumf>, %42, %arg1 [1] : vector<64x64xf32> to vector<64xf32> | |
%44 = arith.subf %arg1, %43 : vector<64xf32> | |
%45 = math.exp2 %44 : vector<64xf32> | |
%46 = arith.mulf %45, %arg2 : vector<64xf32> | |
%47 = vector.broadcast %43 : vector<64xf32> to vector<64x64xf32> | |
%48 = vector.transpose %47, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%49 = arith.subf %42, %48 : vector<64x64xf32> | |
%50 = math.exp2 %49 : vector<64x64xf32> | |
%51 = vector.multi_reduction <add>, %50, %46 [1] : vector<64x64xf32> to vector<64xf32> | |
%52 = arith.truncf %50 : vector<64x64xf32> to vector<64x64xf16> | |
%53 = vector.broadcast %45 : vector<64xf32> to vector<64x64xf32> | |
%54 = vector.transpose %53, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%55 = arith.mulf %54, %arg3 : vector<64x64xf32> | |
%56 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%57 = vector.transfer_write %35, %56[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%58 = iree_gpu.value_barrier %57 : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%59 = vector.transfer_read %58[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%60 = iree_vector_ext.to_layout %59 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%61 = iree_vector_ext.to_layout %52 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%62 = iree_vector_ext.to_layout %55 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%63 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %61, %62 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%64 = iree_vector_ext.to_layout %63 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %43, %51, %64 : vector<64xf32>, vector<64xf32>, vector<64x64xf32> | |
} | |
%26 = vector.broadcast %25#1 : vector<64xf32> to vector<64x64xf32> | |
%27 = arith.divf %cst_4, %26 : vector<64x64xf32> | |
%28 = vector.transpose %27, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%29 = arith.mulf %28, %25#2 : vector<64x64xf32> | |
%30 = arith.truncf %29 : vector<64x64xf32> to vector<64x64xf16> | |
%31 = vector.transfer_write %30, %17[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16> | |
%inserted_slice = tensor.insert_slice %31 into %11[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %inserted_slice, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c64 = arith.constant 64 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_1 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_2 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_3 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_4 = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
gpu.barrier | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = vector.transfer_read %12[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x64x64xf16>, vector<64x64xf16> | |
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%17 = tensor.empty() : tensor<64x64xf16> | |
%18 = arith.mulf %16, %cst_3 : vector<64x64xf16> | |
%19 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%20 = vector.transfer_write %18, %19[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%21 = iree_gpu.value_barrier %20 : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%22 = vector.transfer_read %21[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%24 = iree_vector_ext.to_layout %cst_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%25:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %cst_1, %arg2 = %cst_2, %arg3 = %cst_0) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) { | |
gpu.barrier | |
%32 = vector.transfer_read %13[%c0, %c0, %arg0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x4096x64xf16>, vector<64x64xf16> | |
%33 = iree_vector_ext.to_layout %32 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%34 = vector.transfer_read %14[%c0, %c0, %c0, %arg0], %cst {in_bounds = [true, true]} : tensor<1x1x64x4096xf16>, vector<64x64xf16> | |
%35 = iree_vector_ext.to_layout %34 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%36 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%37 = vector.transfer_write %33, %36[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%38 = iree_gpu.value_barrier %37 : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%39 = vector.transfer_read %38[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%40 = iree_vector_ext.to_layout %39 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%41 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %40, %23, %24 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%42 = iree_vector_ext.to_layout %41 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%43 = vector.multi_reduction <maximumf>, %42, %arg1 [1] : vector<64x64xf32> to vector<64xf32> | |
%44 = arith.subf %arg1, %43 : vector<64xf32> | |
%45 = math.exp2 %44 : vector<64xf32> | |
%46 = arith.mulf %45, %arg2 : vector<64xf32> | |
%47 = vector.broadcast %43 : vector<64xf32> to vector<64x64xf32> | |
%48 = vector.transpose %47, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%49 = arith.subf %42, %48 : vector<64x64xf32> | |
%50 = math.exp2 %49 : vector<64x64xf32> | |
%51 = vector.multi_reduction <add>, %50, %46 [1] : vector<64x64xf32> to vector<64xf32> | |
%52 = arith.truncf %50 : vector<64x64xf32> to vector<64x64xf16> | |
%53 = vector.broadcast %45 : vector<64xf32> to vector<64x64xf32> | |
%54 = vector.transpose %53, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%55 = arith.mulf %54, %arg3 : vector<64x64xf32> | |
%56 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%57 = vector.transfer_write %35, %56[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%58 = iree_gpu.value_barrier %57 : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%59 = vector.transfer_read %58[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%60 = iree_vector_ext.to_layout %59 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%61 = iree_vector_ext.to_layout %52 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%62 = iree_vector_ext.to_layout %55 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%63 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %61, %62 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%64 = iree_vector_ext.to_layout %63 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %43, %51, %64 : vector<64xf32>, vector<64xf32>, vector<64x64xf32> | |
} | |
%26 = vector.broadcast %25#1 : vector<64xf32> to vector<64x64xf32> | |
%27 = arith.divf %cst_4, %26 : vector<64x64xf32> | |
%28 = vector.transpose %27, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%29 = arith.mulf %28, %25#2 : vector<64x64xf32> | |
%30 = arith.truncf %29 : vector<64x64xf32> to vector<64x64xf16> | |
%31 = vector.transfer_write %30, %17[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16> | |
%inserted_slice = tensor.insert_slice %31 into %11[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %inserted_slice, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
// -----// IR Dump After GPUCombineValueBarriersPass (iree-codegen-gpu-combine-value-barriers) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c64 = arith.constant 64 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_1 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_2 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_3 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_4 = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
gpu.barrier | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = vector.transfer_read %12[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x64x64xf16>, vector<64x64xf16> | |
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%17 = tensor.empty() : tensor<64x64xf16> | |
%18 = arith.mulf %16, %cst_3 : vector<64x64xf16> | |
%19 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%20 = vector.transfer_write %18, %19[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%21 = iree_gpu.value_barrier %20 : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%22 = vector.transfer_read %21[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%24 = iree_vector_ext.to_layout %cst_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%25:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %cst_1, %arg2 = %cst_2, %arg3 = %cst_0) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) { | |
gpu.barrier | |
%32 = vector.transfer_read %13[%c0, %c0, %arg0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x4096x64xf16>, vector<64x64xf16> | |
%33 = iree_vector_ext.to_layout %32 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%34 = vector.transfer_read %14[%c0, %c0, %c0, %arg0], %cst {in_bounds = [true, true]} : tensor<1x1x64x4096xf16>, vector<64x64xf16> | |
%35 = iree_vector_ext.to_layout %34 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%36 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%37 = vector.transfer_write %33, %36[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%38 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%39 = vector.transfer_write %35, %38[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%40:2 = iree_gpu.value_barrier %37, %39 : tensor<64x64xf16, #gpu.address_space<workgroup>>, tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%41 = vector.transfer_read %40#0[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%42 = iree_vector_ext.to_layout %41 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%43 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %42, %23, %24 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%44 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%45 = vector.multi_reduction <maximumf>, %44, %arg1 [1] : vector<64x64xf32> to vector<64xf32> | |
%46 = arith.subf %arg1, %45 : vector<64xf32> | |
%47 = math.exp2 %46 : vector<64xf32> | |
%48 = arith.mulf %47, %arg2 : vector<64xf32> | |
%49 = vector.broadcast %45 : vector<64xf32> to vector<64x64xf32> | |
%50 = vector.transpose %49, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%51 = arith.subf %44, %50 : vector<64x64xf32> | |
%52 = math.exp2 %51 : vector<64x64xf32> | |
%53 = vector.multi_reduction <add>, %52, %48 [1] : vector<64x64xf32> to vector<64xf32> | |
%54 = arith.truncf %52 : vector<64x64xf32> to vector<64x64xf16> | |
%55 = vector.broadcast %47 : vector<64xf32> to vector<64x64xf32> | |
%56 = vector.transpose %55, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%57 = arith.mulf %56, %arg3 : vector<64x64xf32> | |
%58 = vector.transfer_read %40#1[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%59 = iree_vector_ext.to_layout %58 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%60 = iree_vector_ext.to_layout %54 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%61 = iree_vector_ext.to_layout %57 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %59, %60, %61 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%63 = iree_vector_ext.to_layout %62 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %45, %53, %63 : vector<64xf32>, vector<64xf32>, vector<64x64xf32> | |
} | |
%26 = vector.broadcast %25#1 : vector<64xf32> to vector<64x64xf32> | |
%27 = arith.divf %cst_4, %26 : vector<64x64xf32> | |
%28 = vector.transpose %27, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%29 = arith.mulf %28, %25#2 : vector<64x64xf32> | |
%30 = arith.truncf %29 : vector<64x64xf32> to vector<64x64xf16> | |
%31 = vector.transfer_write %30, %17[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16> | |
%inserted_slice = tensor.insert_slice %31 into %11[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %inserted_slice, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
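// -----// Note: effect of GPUCombineValueBarriersPass on the loop above (schematic, not compiler output) //----- //
// Relative to the CSE dump, the two single-tensor iree_gpu.value_barrier ops inside the loop (one per
// operand tile staged to workgroup memory) have been merged into one two-operand barrier, and both
// vector.transfer_write ops now precede it, so each loop iteration needs a single synchronization point
// before the MMAs read shared memory. Condensed from the IR above, with %writeA/%writeB as placeholder
// names for the two transfer_write results:
//   before:  %a = iree_gpu.value_barrier %writeA : tensor<64x64xf16, #gpu.address_space<workgroup>>
//            ... first vector.contract ...
//            %b = iree_gpu.value_barrier %writeB : tensor<64x64xf16, #gpu.address_space<workgroup>>
//   after:   %ab:2 = iree_gpu.value_barrier %writeA, %writeB
//                      : tensor<64x64xf16, #gpu.address_space<workgroup>>, tensor<64x64xf16, #gpu.address_space<workgroup>>
// with the two contracts then reading %ab#0 and %ab#1 respectively.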
// -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c64 = arith.constant 64 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_1 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_2 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_3 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_4 = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
gpu.barrier | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = vector.transfer_read %12[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x64x64xf16>, vector<64x64xf16> | |
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%extracted_slice = tensor.extract_slice %11[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> to tensor<64x64xf16> | |
%17 = tensor.empty() : tensor<64x64xf16> | |
%18 = arith.mulf %16, %cst_3 : vector<64x64xf16> | |
%19 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%20 = vector.transfer_write %18, %19[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%21 = iree_gpu.value_barrier %20 : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%22 = vector.transfer_read %21[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%24 = iree_vector_ext.to_layout %cst_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%25:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %cst_1, %arg2 = %cst_2, %arg3 = %cst_0) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) { | |
gpu.barrier | |
%32 = vector.transfer_read %13[%c0, %c0, %arg0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x4096x64xf16>, vector<64x64xf16> | |
%33 = iree_vector_ext.to_layout %32 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%34 = vector.transfer_read %14[%c0, %c0, %c0, %arg0], %cst {in_bounds = [true, true]} : tensor<1x1x64x4096xf16>, vector<64x64xf16> | |
%35 = iree_vector_ext.to_layout %34 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%36 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%37 = vector.transfer_write %33, %36[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%38 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%39 = vector.transfer_write %35, %38[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%40:2 = iree_gpu.value_barrier %37, %39 : tensor<64x64xf16, #gpu.address_space<workgroup>>, tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%41 = vector.transfer_read %40#0[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%42 = iree_vector_ext.to_layout %41 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%43 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %42, %23, %24 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%44 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%45 = vector.multi_reduction <maximumf>, %44, %arg1 [1] : vector<64x64xf32> to vector<64xf32> | |
%46 = arith.subf %arg1, %45 : vector<64xf32> | |
%47 = math.exp2 %46 : vector<64xf32> | |
%48 = arith.mulf %47, %arg2 : vector<64xf32> | |
%49 = vector.broadcast %45 : vector<64xf32> to vector<64x64xf32> | |
%50 = vector.transpose %49, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%51 = arith.subf %44, %50 : vector<64x64xf32> | |
%52 = math.exp2 %51 : vector<64x64xf32> | |
%53 = vector.multi_reduction <add>, %52, %48 [1] : vector<64x64xf32> to vector<64xf32> | |
%54 = arith.truncf %52 : vector<64x64xf32> to vector<64x64xf16> | |
%55 = vector.broadcast %47 : vector<64xf32> to vector<64x64xf32> | |
%56 = vector.transpose %55, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%57 = arith.mulf %56, %arg3 : vector<64x64xf32> | |
%58 = vector.transfer_read %40#1[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%59 = iree_vector_ext.to_layout %58 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%60 = iree_vector_ext.to_layout %54 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%61 = iree_vector_ext.to_layout %57 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %59, %60, %61 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%63 = iree_vector_ext.to_layout %62 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %45, %53, %63 : vector<64xf32>, vector<64xf32>, vector<64x64xf32> | |
} | |
%26 = vector.broadcast %25#1 : vector<64xf32> to vector<64x64xf32> | |
%27 = arith.divf %cst_4, %26 : vector<64x64xf32> | |
%28 = vector.transpose %27, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%29 = arith.mulf %28, %25#2 : vector<64x64xf32> | |
%30 = arith.truncf %29 : vector<64x64xf32> to vector<64x64xf16> | |
%31 = vector.transfer_write %30, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16> | |
%inserted_slice = tensor.insert_slice %31 into %11[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %inserted_slice, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
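Note: the scf.for loop in the dump above is the online-softmax (flash-attention style) inner loop for one 64-row query tile. Each 64-wide K/V step computes S = Q*K^T with the first vector.contract, folds the new row maximum into the running maximum, rescales the old row sum and accumulator by exp2(old_max - new_max), and adds exp2(S - new_max)*V with the second vector.contract; the epilogue after the loop divides the accumulator by the row sum. The 1.802980e-01 constant that pre-scales the Q tile appears to be the attention scale 1/sqrt(64) = 0.125 folded with log2(e), which is what makes the base-2 exponentials equivalent to the usual softmax. A minimal NumPy sketch of the same recurrence follows (illustrative only, not IREE code; the names attention_tile, q_tile, k_all, v_all and block are assumptions, and V is shown untransposed even though the dump keeps it as a 64x4096 tile):

import numpy as np

def attention_tile(q_tile, k_all, v_all, block=64):
    """q_tile: (64, 64) queries; k_all, v_all: (4096, 64) keys/values."""
    # The dump pre-multiplies Q by ~0.1802980, i.e. 1/sqrt(64) * log2(e) rounded to f16.
    scale = (1.0 / np.sqrt(q_tile.shape[-1])) * np.log2(np.e)
    q = q_tile.astype(np.float32) * scale
    m = np.full(q.shape[0], -np.finfo(np.float32).max, np.float32)  # running row max (%arg1, init -3.40282347E+38)
    l = np.zeros(q.shape[0], np.float32)                            # running row sum (%arg2)
    acc = np.zeros((q.shape[0], v_all.shape[-1]), np.float32)       # output accumulator (%arg3)
    for start in range(0, k_all.shape[0], block):
        k = k_all[start:start + block].astype(np.float32)
        v = v_all[start:start + block].astype(np.float32)
        s = q @ k.T                                  # first vector.contract
        m_new = np.maximum(m, s.max(axis=1))         # vector.multi_reduction <maximumf>
        alpha = np.exp2(m - m_new)                   # rescale factor for the old state
        p = np.exp2(s - m_new[:, None])              # unnormalized probabilities
        l = alpha * l + p.sum(axis=1)                # vector.multi_reduction <add>
        acc = alpha[:, None] * acc + p @ v           # second vector.contract
        m = m_new
    return acc / l[:, None]                          # epilogue: divide by the row sum

# Quick check against a direct softmax(Q*K^T / sqrt(64)) @ V reference:
rng = np.random.default_rng(0)
q, k, v = (rng.standard_normal((n, 64)).astype(np.float16) for n in (64, 4096, 4096))
s = (q.astype(np.float32) @ k.astype(np.float32).T) / 8.0
ref = np.exp(s - s.max(axis=1, keepdims=True))
ref /= ref.sum(axis=1, keepdims=True)
print(np.allclose(attention_tile(q, k, v), ref @ v.astype(np.float32), atol=1e-3))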
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c64 = arith.constant 64 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_1 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_2 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_3 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_4 = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
gpu.barrier | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%11 = flow.dispatch.tensor.load %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<1x64x1x64xf16> | |
%12 = flow.dispatch.tensor.load %7, offsets = [0, %workgroup_id_z, %workgroup_id_y, %10, 0], sizes = [1, 1, 1, 64, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x64x64xf16> | |
%13 = flow.dispatch.tensor.load %7, offsets = [1, %workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 1, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<1x1x4096x64xf16> | |
%14 = flow.dispatch.tensor.load %8, offsets = [%workgroup_id_z, %workgroup_id_y, 0, 0], sizes = [1, 1, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<1x1x64x4096xf16> | |
%15 = vector.transfer_read %12[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x64x64xf16>, vector<64x64xf16> | |
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%extracted_slice = tensor.extract_slice %11[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<1x64x1x64xf16> to tensor<64x64xf16> | |
%17 = arith.mulf %16, %cst_3 : vector<64x64xf16> | |
%18 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%19 = vector.transfer_write %17, %18[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%20 = iree_gpu.value_barrier %19 : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%21 = vector.transfer_read %20[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%22 = iree_vector_ext.to_layout %21 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%23 = iree_vector_ext.to_layout %cst_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%24:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %cst_1, %arg2 = %cst_2, %arg3 = %cst_0) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) { | |
gpu.barrier | |
%31 = vector.transfer_read %13[%c0, %c0, %arg0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x4096x64xf16>, vector<64x64xf16> | |
%32 = iree_vector_ext.to_layout %31 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%33 = vector.transfer_read %14[%c0, %c0, %c0, %arg0], %cst {in_bounds = [true, true]} : tensor<1x1x64x4096xf16>, vector<64x64xf16> | |
%34 = iree_vector_ext.to_layout %33 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%35 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%36 = vector.transfer_write %32, %35[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%37 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%38 = vector.transfer_write %34, %37[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%39:2 = iree_gpu.value_barrier %36, %38 : tensor<64x64xf16, #gpu.address_space<workgroup>>, tensor<64x64xf16, #gpu.address_space<workgroup>> | |
%40 = vector.transfer_read %39#0[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%41 = iree_vector_ext.to_layout %40 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%42 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %41, %22, %23 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%43 = iree_vector_ext.to_layout %42 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%44 = vector.multi_reduction <maximumf>, %43, %arg1 [1] : vector<64x64xf32> to vector<64xf32> | |
%45 = arith.subf %arg1, %44 : vector<64xf32> | |
%46 = math.exp2 %45 : vector<64xf32> | |
%47 = arith.mulf %46, %arg2 : vector<64xf32> | |
%48 = vector.broadcast %44 : vector<64xf32> to vector<64x64xf32> | |
%49 = vector.transpose %48, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%50 = arith.subf %43, %49 : vector<64x64xf32> | |
%51 = math.exp2 %50 : vector<64x64xf32> | |
%52 = vector.multi_reduction <add>, %51, %47 [1] : vector<64x64xf32> to vector<64xf32> | |
%53 = arith.truncf %51 : vector<64x64xf32> to vector<64x64xf16> | |
%54 = vector.broadcast %46 : vector<64xf32> to vector<64x64xf32> | |
%55 = vector.transpose %54, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%56 = arith.mulf %55, %arg3 : vector<64x64xf32> | |
%57 = vector.transfer_read %39#1[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%58 = iree_vector_ext.to_layout %57 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%59 = iree_vector_ext.to_layout %53 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%60 = iree_vector_ext.to_layout %56 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%61 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %59, %60 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%62 = iree_vector_ext.to_layout %61 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %44, %52, %62 : vector<64xf32>, vector<64xf32>, vector<64x64xf32> | |
} | |
%25 = vector.broadcast %24#1 : vector<64xf32> to vector<64x64xf32> | |
%26 = arith.divf %cst_4, %25 : vector<64x64xf32> | |
%27 = vector.transpose %26, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%28 = arith.mulf %27, %24#2 : vector<64x64xf32> | |
%29 = arith.truncf %28 : vector<64x64xf32> to vector<64x64xf16> | |
%30 = vector.transfer_write %29, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16> | |
%inserted_slice = tensor.insert_slice %30 into %11[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : tensor<64x64xf16> into tensor<1x64x1x64xf16> | |
flow.dispatch.tensor.store %inserted_slice, %9, offsets = [%workgroup_id_z, %10, %workgroup_id_y, 0], sizes = [1, 64, 1, 64], strides = [1, 1, 1, 1] : tensor<1x64x1x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> | |
return | |
} | |
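Note: across these dumps the workgroup-to-tile mapping is the same and can be read directly off the flow.dispatch.tensor.load offsets: workgroup_id_x selects a 64-row query block (the affine.apply computes x * 64), workgroup_id_y indexes the 10 heads, and workgroup_id_z the 2 batches. The query tile and the keys come from the two halves of the 2x2x10x4096x64 buffer (%12 and %13), and what appears to be V comes from the 2x10x64x4096 buffer (%14) stored as [head_dim, seq], i.e. transposed. A small NumPy sketch of that slicing (the names wg_x/wg_y/wg_z, workgroup_tiles and the zero-filled buffers are illustrative assumptions, not identifiers from the dump):

import numpy as np

qk  = np.zeros((2, 2, 10, 4096, 64), np.float16)  # binding 0 at offset %6#0: Q (index 0) and K (index 1)
v_t = np.zeros((2, 10, 64, 4096), np.float16)     # binding 0 at offset %6#1: V stored as [head_dim, seq]
out = np.zeros((2, 4096, 10, 64), np.float16)     # binding 1 at offset %6#2: attention output

def workgroup_tiles(wg_x, wg_y, wg_z):
    q_base = wg_x * 64                                     # affine_map<()[s0] -> (s0 * 64)>
    q_tile   = qk[0, wg_z, wg_y, q_base:q_base + 64, :]    # 64x64 query tile    (%12)
    k_all    = qk[1, wg_z, wg_y, :, :]                     # 4096x64 keys        (%13)
    v_all_t  = v_t[wg_z, wg_y, :, :]                       # 64x4096 values^T    (%14)
    out_tile = out[wg_z, q_base:q_base + 64, wg_y, :]      # 64x64 output slice  (%11)
    return q_tile, k_all, v_all_t, out_tile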
// -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c64 = arith.constant 64 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_1 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_2 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_3 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_4 = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
gpu.barrier | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %9[%workgroup_id_z, %10, %workgroup_id_y, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %7[0, %workgroup_id_z, %workgroup_id_y, %10, 0] [1, 1, 1, 64, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_6 = memref.subview %7[1, %workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 1, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %8[%workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 64, 4096] [1, 1, 1, 1] : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%11 = vector.transfer_read %subview_5[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%subview_8 = memref.subview %subview[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%13 = arith.mulf %12, %cst_3 : vector<64x64xf16> | |
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %13, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%14 = vector.transfer_read %alloc[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%16 = iree_vector_ext.to_layout %cst_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%17:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %cst_1, %arg2 = %cst_2, %arg3 = %cst_0) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) { | |
gpu.barrier | |
%23 = vector.transfer_read %subview_6[%c0, %c0, %arg0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%25 = vector.transfer_read %subview_7[%c0, %c0, %c0, %arg0], %cst {in_bounds = [true, true]} : memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%26 = iree_vector_ext.to_layout %25 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%alloc_11 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %24, %alloc_11[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%alloc_12 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %26, %alloc_12[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%27 = vector.transfer_read %alloc_11[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%28 = iree_vector_ext.to_layout %27 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%29 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %28, %15, %16 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%30 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%31 = vector.multi_reduction <maximumf>, %30, %arg1 [1] : vector<64x64xf32> to vector<64xf32> | |
%32 = arith.subf %arg1, %31 : vector<64xf32> | |
%33 = math.exp2 %32 : vector<64xf32> | |
%34 = arith.mulf %33, %arg2 : vector<64xf32> | |
%35 = vector.broadcast %31 : vector<64xf32> to vector<64x64xf32> | |
%36 = vector.transpose %35, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%37 = arith.subf %30, %36 : vector<64x64xf32> | |
%38 = math.exp2 %37 : vector<64x64xf32> | |
%39 = vector.multi_reduction <add>, %38, %34 [1] : vector<64x64xf32> to vector<64xf32> | |
%40 = arith.truncf %38 : vector<64x64xf32> to vector<64x64xf16> | |
%41 = vector.broadcast %33 : vector<64xf32> to vector<64x64xf32> | |
%42 = vector.transpose %41, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%43 = arith.mulf %42, %arg3 : vector<64x64xf32> | |
%44 = vector.transfer_read %alloc_12[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%45 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%46 = iree_vector_ext.to_layout %40 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%47 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%48 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %45, %46, %47 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%49 = iree_vector_ext.to_layout %48 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %31, %39, %49 : vector<64xf32>, vector<64xf32>, vector<64x64xf32> | |
} | |
%18 = vector.broadcast %17#1 : vector<64xf32> to vector<64x64xf32> | |
%19 = arith.divf %cst_4, %18 : vector<64x64xf32> | |
%20 = vector.transpose %19, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%21 = arith.mulf %20, %17#2 : vector<64x64xf32> | |
%22 = arith.truncf %21 : vector<64x64xf32> to vector<64x64xf16> | |
vector.transfer_write %22, %subview_8[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_9 = memref.subview %subview[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview_8, %subview_9 : memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_10 = memref.subview %9[%workgroup_id_z, %10, %workgroup_id_y, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview, %subview_10 : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
return | |
} | |
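Note: after bufferization each binding becomes a strided memref whose strides are simply the row-major strides of its shape (e.g. [5242880, 2621440, 262144, 64, 1] for 2x2x10x4096x64), and every memref.subview above addresses base + sum(offset_i * stride_i), with rank-reduced unit dims dropping their stride from the result type. Also worth noting: the two trailing memref.copy ops copy subviews with identical bases and offsets onto themselves (the tensor.insert_slice and the dispatch store bufferized in place onto the same output buffer), so they move no new data. A tiny sketch of the stride and offset arithmetic (illustrative Python, not IREE code; the example indices are arbitrary):

# Row-major strides of a shape and the linearized element offset of a subview's
# first element, matching the strided<[...], offset: ?> memref types in the dump above.
def row_major_strides(shape):
    strides, running = [], 1
    for dim in reversed(shape):
        strides.append(running)
        running *= dim
    return strides[::-1]

def subview_offset(offsets, strides, base=0):
    return base + sum(i * s for i, s in zip(offsets, strides))

strides_qk = row_major_strides((2, 2, 10, 4096, 64))
print(strides_qk)                                  # [5242880, 2621440, 262144, 64, 1]
wg_z, wg_y, q_base = 1, 3, 7 * 64                  # arbitrary example workgroup indices
# e.g. %subview_5 = memref.subview %7[0, wg_z, wg_y, q_base, 0] [1, 1, 1, 64, 64] ...
print(subview_offset([0, wg_z, wg_y, q_base, 0], strides_qk))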
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c64 = arith.constant 64 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_1 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_2 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_3 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_4 = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
gpu.barrier | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %9[%workgroup_id_z, %10, %workgroup_id_y, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %7[0, %workgroup_id_z, %workgroup_id_y, %10, 0] [1, 1, 1, 64, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_6 = memref.subview %7[1, %workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 1, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %8[%workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 64, 4096] [1, 1, 1, 1] : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%11 = vector.transfer_read %subview_5[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%subview_8 = memref.subview %subview[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%13 = arith.mulf %12, %cst_3 : vector<64x64xf16> | |
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %13, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%14 = vector.transfer_read %alloc[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%16 = iree_vector_ext.to_layout %cst_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%17:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %cst_1, %arg2 = %cst_2, %arg3 = %cst_0) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) { | |
gpu.barrier | |
%23 = vector.transfer_read %subview_6[%c0, %c0, %arg0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%25 = vector.transfer_read %subview_7[%c0, %c0, %c0, %arg0], %cst {in_bounds = [true, true]} : memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%26 = iree_vector_ext.to_layout %25 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%alloc_11 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %24, %alloc_11[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%alloc_12 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %26, %alloc_12[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%27 = vector.transfer_read %alloc_11[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%28 = iree_vector_ext.to_layout %27 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%29 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %28, %15, %16 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%30 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%31 = vector.multi_reduction <maximumf>, %30, %arg1 [1] : vector<64x64xf32> to vector<64xf32> | |
%32 = arith.subf %arg1, %31 : vector<64xf32> | |
%33 = math.exp2 %32 : vector<64xf32> | |
%34 = arith.mulf %33, %arg2 : vector<64xf32> | |
%35 = vector.broadcast %31 : vector<64xf32> to vector<64x64xf32> | |
%36 = vector.transpose %35, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%37 = arith.subf %30, %36 : vector<64x64xf32> | |
%38 = math.exp2 %37 : vector<64x64xf32> | |
%39 = vector.multi_reduction <add>, %38, %34 [1] : vector<64x64xf32> to vector<64xf32> | |
%40 = arith.truncf %38 : vector<64x64xf32> to vector<64x64xf16> | |
%41 = vector.broadcast %33 : vector<64xf32> to vector<64x64xf32> | |
%42 = vector.transpose %41, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%43 = arith.mulf %42, %arg3 : vector<64x64xf32> | |
%44 = vector.transfer_read %alloc_12[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%45 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%46 = iree_vector_ext.to_layout %40 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%47 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%48 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %45, %46, %47 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%49 = iree_vector_ext.to_layout %48 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %31, %39, %49 : vector<64xf32>, vector<64xf32>, vector<64x64xf32> | |
} | |
%18 = vector.broadcast %17#1 : vector<64xf32> to vector<64x64xf32> | |
%19 = arith.divf %cst_4, %18 : vector<64x64xf32> | |
%20 = vector.transpose %19, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%21 = arith.mulf %20, %17#2 : vector<64x64xf32> | |
%22 = arith.truncf %21 : vector<64x64xf32> to vector<64x64xf16> | |
vector.transfer_write %22, %subview_8[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_9 = memref.subview %subview[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview_8, %subview_9 : memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_10 = memref.subview %9[%workgroup_id_z, %10, %workgroup_id_y, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview, %subview_10 : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c64 = arith.constant 64 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_1 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_2 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_3 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_4 = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
gpu.barrier | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %9[%workgroup_id_z, %10, %workgroup_id_y, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %7[0, %workgroup_id_z, %workgroup_id_y, %10, 0] [1, 1, 1, 64, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_6 = memref.subview %7[1, %workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 1, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %8[%workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 64, 4096] [1, 1, 1, 1] : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%11 = vector.transfer_read %subview_5[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%subview_8 = memref.subview %subview[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%13 = arith.mulf %12, %cst_3 : vector<64x64xf16> | |
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %13, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%14 = vector.transfer_read %alloc[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%16 = iree_vector_ext.to_layout %cst_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%17:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %cst_1, %arg2 = %cst_2, %arg3 = %cst_0) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) { | |
gpu.barrier | |
%23 = vector.transfer_read %subview_6[%c0, %c0, %arg0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%25 = vector.transfer_read %subview_7[%c0, %c0, %c0, %arg0], %cst {in_bounds = [true, true]} : memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%26 = iree_vector_ext.to_layout %25 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%alloc_11 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %24, %alloc_11[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%alloc_12 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %26, %alloc_12[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%27 = vector.transfer_read %alloc_11[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%28 = iree_vector_ext.to_layout %27 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%29 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %28, %15, %16 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%30 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%31 = vector.multi_reduction <maximumf>, %30, %arg1 [1] : vector<64x64xf32> to vector<64xf32> | |
%32 = arith.subf %arg1, %31 : vector<64xf32> | |
%33 = math.exp2 %32 : vector<64xf32> | |
%34 = arith.mulf %33, %arg2 : vector<64xf32> | |
%35 = vector.broadcast %31 : vector<64xf32> to vector<64x64xf32> | |
%36 = vector.transpose %35, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%37 = arith.subf %30, %36 : vector<64x64xf32> | |
%38 = math.exp2 %37 : vector<64x64xf32> | |
%39 = vector.multi_reduction <add>, %38, %34 [1] : vector<64x64xf32> to vector<64xf32> | |
%40 = arith.truncf %38 : vector<64x64xf32> to vector<64x64xf16> | |
%41 = vector.broadcast %33 : vector<64xf32> to vector<64x64xf32> | |
%42 = vector.transpose %41, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%43 = arith.mulf %42, %arg3 : vector<64x64xf32> | |
%44 = vector.transfer_read %alloc_12[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%45 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%46 = iree_vector_ext.to_layout %40 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%47 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%48 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %45, %46, %47 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%49 = iree_vector_ext.to_layout %48 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %31, %39, %49 : vector<64xf32>, vector<64xf32>, vector<64x64xf32> | |
} | |
%18 = vector.broadcast %17#1 : vector<64xf32> to vector<64x64xf32> | |
%19 = arith.divf %cst_4, %18 : vector<64x64xf32> | |
%20 = vector.transpose %19, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%21 = arith.mulf %20, %17#2 : vector<64x64xf32> | |
%22 = arith.truncf %21 : vector<64x64xf32> to vector<64x64xf16> | |
vector.transfer_write %22, %subview_8[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_9 = memref.subview %subview[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview_8, %subview_9 : memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_10 = memref.subview %9[%workgroup_id_z, %10, %workgroup_id_y, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview, %subview_10 : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
return | |
} | |
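// Note (annotation, not compiler output): the function above is the attention dispatch lowered through the
// LLVMGPUVectorDistribute pipeline. workgroup_size = [128, 1, 1] with subgroup_size = 64 gives two subgroups
// per workgroup, which matches the subgroup_tile = [2, 1] in the accumulator layouts, and the
// mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16> annotations anchor those operands to the 16x16x16 f16 MFMA intrinsic.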
// -----// IR Dump After CSE (cse) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c64 = arith.constant 64 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_1 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_2 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_3 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_4 = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
gpu.barrier | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %9[%workgroup_id_z, %10, %workgroup_id_y, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %7[0, %workgroup_id_z, %workgroup_id_y, %10, 0] [1, 1, 1, 64, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_6 = memref.subview %7[1, %workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 1, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %8[%workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 64, 4096] [1, 1, 1, 1] : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%11 = vector.transfer_read %subview_5[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%subview_8 = memref.subview %subview[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%13 = arith.mulf %12, %cst_3 : vector<64x64xf16> | |
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %13, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%14 = vector.transfer_read %alloc[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%16 = iree_vector_ext.to_layout %cst_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%17:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %cst_1, %arg2 = %cst_2, %arg3 = %cst_0) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) { | |
gpu.barrier | |
%23 = vector.transfer_read %subview_6[%c0, %c0, %arg0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%25 = vector.transfer_read %subview_7[%c0, %c0, %c0, %arg0], %cst {in_bounds = [true, true]} : memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%26 = iree_vector_ext.to_layout %25 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%alloc_9 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %24, %alloc_9[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%alloc_10 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %26, %alloc_10[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%27 = vector.transfer_read %alloc_9[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%28 = iree_vector_ext.to_layout %27 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%29 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %28, %15, %16 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%30 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%31 = vector.multi_reduction <maximumf>, %30, %arg1 [1] : vector<64x64xf32> to vector<64xf32> | |
%32 = arith.subf %arg1, %31 : vector<64xf32> | |
%33 = math.exp2 %32 : vector<64xf32> | |
%34 = arith.mulf %33, %arg2 : vector<64xf32> | |
%35 = vector.broadcast %31 : vector<64xf32> to vector<64x64xf32> | |
%36 = vector.transpose %35, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%37 = arith.subf %30, %36 : vector<64x64xf32> | |
%38 = math.exp2 %37 : vector<64x64xf32> | |
%39 = vector.multi_reduction <add>, %38, %34 [1] : vector<64x64xf32> to vector<64xf32> | |
%40 = arith.truncf %38 : vector<64x64xf32> to vector<64x64xf16> | |
%41 = vector.broadcast %33 : vector<64xf32> to vector<64x64xf32> | |
%42 = vector.transpose %41, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%43 = arith.mulf %42, %arg3 : vector<64x64xf32> | |
%44 = vector.transfer_read %alloc_10[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%45 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%46 = iree_vector_ext.to_layout %40 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%47 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%48 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %45, %46, %47 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%49 = iree_vector_ext.to_layout %48 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %31, %39, %49 : vector<64xf32>, vector<64xf32>, vector<64x64xf32> | |
} | |
%18 = vector.broadcast %17#1 : vector<64xf32> to vector<64x64xf32> | |
%19 = arith.divf %cst_4, %18 : vector<64x64xf32> | |
%20 = vector.transpose %19, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%21 = arith.mulf %20, %17#2 : vector<64x64xf32> | |
%22 = arith.truncf %21 : vector<64x64xf32> to vector<64x64xf16> | |
vector.transfer_write %22, %subview_8[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview_8, %subview_8 : memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview, %subview : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
return | |
} | |
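// Note (annotation, not compiler output): CSE folded %subview_9 into %subview_8 and %subview_10 into %subview
// (they were defined as identical subviews), which is why the two memref.copy ops above now copy a buffer onto
// itself; these no-op copies are dropped by the canonicalizer in the next dump.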
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c64 = arith.constant 64 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_1 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_2 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_3 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_4 = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
gpu.barrier | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %9[%workgroup_id_z, %10, %workgroup_id_y, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %7[0, %workgroup_id_z, %workgroup_id_y, %10, 0] [1, 1, 1, 64, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_6 = memref.subview %7[1, %workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 1, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %8[%workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 64, 4096] [1, 1, 1, 1] : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%11 = vector.transfer_read %subview_5[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%subview_8 = memref.subview %subview[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%13 = arith.mulf %12, %cst_3 : vector<64x64xf16> | |
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %13, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%14 = vector.transfer_read %alloc[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%16 = iree_vector_ext.to_layout %cst_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%17:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %cst_1, %arg2 = %cst_2, %arg3 = %cst_0) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) { | |
gpu.barrier | |
%23 = vector.transfer_read %subview_6[%c0, %c0, %arg0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%25 = vector.transfer_read %subview_7[%c0, %c0, %c0, %arg0], %cst {in_bounds = [true, true]} : memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%26 = iree_vector_ext.to_layout %25 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%alloc_9 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %24, %alloc_9[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%alloc_10 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %26, %alloc_10[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%27 = vector.transfer_read %alloc_9[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%28 = iree_vector_ext.to_layout %27 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%29 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %28, %15, %16 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%30 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%31 = vector.multi_reduction <maximumf>, %30, %arg1 [1] : vector<64x64xf32> to vector<64xf32> | |
%32 = arith.subf %arg1, %31 : vector<64xf32> | |
%33 = math.exp2 %32 : vector<64xf32> | |
%34 = arith.mulf %33, %arg2 : vector<64xf32> | |
%35 = vector.broadcast %31 : vector<64xf32> to vector<64x64xf32> | |
%36 = vector.transpose %35, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%37 = arith.subf %30, %36 : vector<64x64xf32> | |
%38 = math.exp2 %37 : vector<64x64xf32> | |
%39 = vector.multi_reduction <add>, %38, %34 [1] : vector<64x64xf32> to vector<64xf32> | |
%40 = arith.truncf %38 : vector<64x64xf32> to vector<64x64xf16> | |
%41 = vector.broadcast %33 : vector<64xf32> to vector<64x64xf32> | |
%42 = vector.transpose %41, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%43 = arith.mulf %42, %arg3 : vector<64x64xf32> | |
%44 = vector.transfer_read %alloc_10[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%45 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%46 = iree_vector_ext.to_layout %40 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%47 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%48 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %45, %46, %47 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%49 = iree_vector_ext.to_layout %48 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %31, %39, %49 : vector<64xf32>, vector<64xf32>, vector<64x64xf32> | |
} | |
%18 = vector.broadcast %17#1 : vector<64xf32> to vector<64x64xf32> | |
%19 = arith.divf %cst_4, %18 : vector<64x64xf32> | |
%20 = vector.transpose %19, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%21 = arith.mulf %20, %17#2 : vector<64x64xf32> | |
%22 = arith.truncf %21 : vector<64x64xf32> to vector<64x64xf16> | |
vector.transfer_write %22, %subview_8[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
return | |
} | |
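// Note (annotation, not compiler output): the scf.for above carries three iter_args across 64-wide K/V tiles:
// the running row maximum (%arg1), the running row sum (%arg2), and the unnormalized output accumulator (%arg3),
// i.e. a flash-attention style online softmax. The NumPy sketch below restates that recurrence under illustrative
// names and shapes; the 1.802980e-01 constant appears to be 0.125 * log2(e) rounded to f16, so math.exp2 stands in
// for exp, and the trailing divf/mulf pair is the deferred normalization.
import numpy as np

def streaming_attention(q, k, v, tile=64, scale=0.125):
    """Online-softmax attention over K/V tiles; mirrors the scf.for recurrence above."""
    log2e = 1.4426950408889634
    m = np.full(q.shape[0], -np.finfo(np.float32).max, dtype=np.float32)   # running row max (%arg1, cst_1 init)
    l = np.zeros(q.shape[0], dtype=np.float32)                             # running row sum (%arg2, cst_2 init)
    acc = np.zeros((q.shape[0], v.shape[1]), dtype=np.float32)             # unnormalized output (%arg3, cst_0 init)
    qs = q.astype(np.float32) * (scale * log2e)                            # cf. the 1.802980e-01 scaling constant
    for j in range(0, k.shape[0], tile):
        s = qs @ k[j:j + tile].astype(np.float32).T                        # Q @ K^T tile (first vector.contract)
        m_new = np.maximum(m, s.max(axis=1))                               # multi_reduction <maximumf>
        corr = np.exp2(m - m_new)                                          # math.exp2 on the shifted old max
        p = np.exp2(s - m_new[:, None])                                    # math.exp2 on the shifted scores
        l = l * corr + p.sum(axis=1)                                       # multi_reduction <add>
        acc = acc * corr[:, None] + p @ v[j:j + tile].astype(np.float32)   # second vector.contract
        m = m_new
    return acc / l[:, None]                                                # final divf/mulf normalization
// In this dispatch each workgroup appears to process one 64-row query tile (%subview_5) against the full
// length-4096 K/V sequence for a single (batch, head) pair selected by %workgroup_id_z and %workgroup_id_y.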
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c64 = arith.constant 64 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_1 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_2 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_3 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_4 = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
gpu.barrier | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %9[%workgroup_id_z, %10, %workgroup_id_y, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %7[0, %workgroup_id_z, %workgroup_id_y, %10, 0] [1, 1, 1, 64, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_6 = memref.subview %7[1, %workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 1, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %8[%workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 64, 4096] [1, 1, 1, 1] : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%11 = vector.transfer_read %subview_5[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%subview_8 = memref.subview %subview[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%13 = arith.mulf %12, %cst_3 : vector<64x64xf16> | |
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %13, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%14 = vector.transfer_read %alloc[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%16 = iree_vector_ext.to_layout %cst_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%17:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %cst_1, %arg2 = %cst_2, %arg3 = %cst_0) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) { | |
gpu.barrier | |
%23 = vector.transfer_read %subview_6[%c0, %c0, %arg0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%25 = vector.transfer_read %subview_7[%c0, %c0, %c0, %arg0], %cst {in_bounds = [true, true]} : memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%26 = iree_vector_ext.to_layout %25 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%alloc_9 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %24, %alloc_9[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%alloc_10 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %26, %alloc_10[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%27 = vector.transfer_read %alloc_9[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%28 = iree_vector_ext.to_layout %27 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%29 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %28, %15, %16 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%30 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%31 = vector.multi_reduction <maximumf>, %30, %arg1 [1] : vector<64x64xf32> to vector<64xf32> | |
%32 = arith.subf %arg1, %31 : vector<64xf32> | |
%33 = math.exp2 %32 : vector<64xf32> | |
%34 = arith.mulf %33, %arg2 : vector<64xf32> | |
%35 = vector.broadcast %31 : vector<64xf32> to vector<64x64xf32> | |
%36 = vector.transpose %35, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%37 = arith.subf %30, %36 : vector<64x64xf32> | |
%38 = math.exp2 %37 : vector<64x64xf32> | |
%39 = vector.multi_reduction <add>, %38, %34 [1] : vector<64x64xf32> to vector<64xf32> | |
%40 = arith.truncf %38 : vector<64x64xf32> to vector<64x64xf16> | |
%41 = vector.broadcast %33 : vector<64xf32> to vector<64x64xf32> | |
%42 = vector.transpose %41, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%43 = arith.mulf %42, %arg3 : vector<64x64xf32> | |
%44 = vector.transfer_read %alloc_10[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%45 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%46 = iree_vector_ext.to_layout %40 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%47 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%48 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %45, %46, %47 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%49 = iree_vector_ext.to_layout %48 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %31, %39, %49 : vector<64xf32>, vector<64xf32>, vector<64x64xf32> | |
} | |
%18 = vector.broadcast %17#1 : vector<64xf32> to vector<64x64xf32> | |
%19 = arith.divf %cst_4, %18 : vector<64x64xf32> | |
%20 = vector.transpose %19, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%21 = arith.mulf %20, %17#2 : vector<64x64xf32> | |
%22 = arith.truncf %21 : vector<64x64xf32> to vector<64x64xf16> | |
vector.transfer_write %22, %subview_8[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
return | |
} | |
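// Note (annotation, not compiler output): the iree-codegen-cleanup-buffer-alloc-view pass appears to leave this
// function unchanged relative to the preceding canonicalize output.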
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c64 = arith.constant 64 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_1 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_2 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_3 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_4 = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
gpu.barrier | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %9[%workgroup_id_z, %10, %workgroup_id_y, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %7[0, %workgroup_id_z, %workgroup_id_y, %10, 0] [1, 1, 1, 64, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_6 = memref.subview %7[1, %workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 1, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %8[%workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 64, 4096] [1, 1, 1, 1] : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%11 = vector.transfer_read %subview_5[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%subview_8 = memref.subview %subview[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%13 = arith.mulf %12, %cst_3 : vector<64x64xf16> | |
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %13, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%14 = vector.transfer_read %alloc[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%16 = iree_vector_ext.to_layout %cst_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%17:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %cst_1, %arg2 = %cst_2, %arg3 = %cst_0) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) { | |
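      // Annotation: this loop walks the 4096-long K/V sequence in 64-wide tiles.
      // The iter_args carry the running row maximum (%arg1), the running row sum
      // (%arg2), and the unnormalized output accumulator (%arg3), i.e. an
      // online-softmax (flash-attention style) accumulation.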
gpu.barrier | |
%23 = vector.transfer_read %subview_6[%c0, %c0, %arg0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%25 = vector.transfer_read %subview_7[%c0, %c0, %c0, %arg0], %cst {in_bounds = [true, true]} : memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%26 = iree_vector_ext.to_layout %25 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%alloc_9 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %24, %alloc_9[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%alloc_10 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %26, %alloc_10[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%27 = vector.transfer_read %alloc_9[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%28 = iree_vector_ext.to_layout %27 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%29 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %28, %15, %16 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
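      // Annotation: this contraction appears to compute the 64x64 score tile
      // S = Q_scaled * K^T for the current K tile (reduction over the 64-element
      // head dimension d1), with f16 inputs accumulating into f32 under the
      // MFMA_F32_16x16x16_F16 layout.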
%30 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%31 = vector.multi_reduction <maximumf>, %30, %arg1 [1] : vector<64x64xf32> to vector<64xf32> | |
%32 = arith.subf %arg1, %31 : vector<64xf32> | |
%33 = math.exp2 %32 : vector<64xf32> | |
%34 = arith.mulf %33, %arg2 : vector<64xf32> | |
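      // Annotation: %31 is the updated running maximum; %33 = exp2(old_max - new_max)
      // is the correction factor that rescales the running sum here (%34) and the
      // accumulator further down (%43) before the new tile's contribution is added.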
%35 = vector.broadcast %31 : vector<64xf32> to vector<64x64xf32> | |
%36 = vector.transpose %35, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%37 = arith.subf %30, %36 : vector<64x64xf32> | |
%38 = math.exp2 %37 : vector<64x64xf32> | |
%39 = vector.multi_reduction <add>, %38, %34 [1] : vector<64x64xf32> to vector<64xf32> | |
%40 = arith.truncf %38 : vector<64x64xf32> to vector<64x64xf16> | |
%41 = vector.broadcast %33 : vector<64xf32> to vector<64x64xf32> | |
%42 = vector.transpose %41, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%43 = arith.mulf %42, %arg3 : vector<64x64xf32> | |
%44 = vector.transfer_read %alloc_10[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%45 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%46 = iree_vector_ext.to_layout %40 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%47 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%48 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %45, %46, %47 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
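      // Annotation: this second contraction appears to accumulate P * V for the
      // tile, where P (%46) holds the exp2-shifted scores truncated to f16 and
      // %45 is the V tile staged through shared memory; the result is added onto
      // the rescaled accumulator %47.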
%49 = iree_vector_ext.to_layout %48 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %31, %39, %49 : vector<64xf32>, vector<64xf32>, vector<64x64xf32> | |
} | |
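    // Annotation: loop epilogue. The accumulator is normalized by the running row
    // sums (%19 = 1 / sum, %21 = acc * (1 / sum)), truncated back to f16, and
    // written to the output subview.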
%18 = vector.broadcast %17#1 : vector<64xf32> to vector<64x64xf32> | |
%19 = arith.divf %cst_4, %18 : vector<64x64xf32> | |
%20 = vector.transpose %19, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%21 = arith.mulf %20, %17#2 : vector<64x64xf32> | |
%22 = arith.truncf %21 : vector<64x64xf32> to vector<64x64xf16> | |
vector.transfer_write %22, %subview_8[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
return | |
} | |
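For reference, a minimal NumPy sketch (reader-added, not produced by the compiler) of the online-softmax accumulation that the scf.for loop above appears to implement. The 64-wide tile, the -FLT_MAX initial maximum, the folded 0.125 * log2(e) query scale, and the use of exp2 mirror the constants visible in the dump; the function and variable names are illustrative only.

import numpy as np

def online_softmax_attention(q, k, v, tile=64):
    # q: (M, 64) queries, k: (N, 64) keys, v: (N, 64) values, all float32.
    m = np.full(q.shape[0], -3.4028235e38, dtype=np.float32)    # running row max
    s = np.zeros(q.shape[0], dtype=np.float32)                  # running row sum
    acc = np.zeros((q.shape[0], v.shape[1]), dtype=np.float32)  # unnormalized output
    qs = q * np.float32(0.125 * np.log2(np.e))                  # scale folded with log2(e)
    for j in range(0, k.shape[0], tile):
        kt, vt = k[j:j + tile], v[j:j + tile]
        scores = qs @ kt.T                         # Q * K^T score tile
        m_new = np.maximum(m, scores.max(axis=1))  # updated running max
        corr = np.exp2(m - m_new)                  # rescale factor for old stats
        p = np.exp2(scores - m_new[:, None])       # base-2 softmax numerator
        s = s * corr + p.sum(axis=1)
        acc = acc * corr[:, None] + p @ vt
        m = m_new
    return acc / s[:, None]                        # normalize by the row sums

Calling this with q of shape (64, 64) and k, v of shape (4096, 64) corresponds to the per-workgroup tile sizes visible in the dump.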
// -----// IR Dump After CSE (cse) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c64 = arith.constant 64 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_1 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_2 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_3 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_4 = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
gpu.barrier | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %9[%workgroup_id_z, %10, %workgroup_id_y, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %7[0, %workgroup_id_z, %workgroup_id_y, %10, 0] [1, 1, 1, 64, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_6 = memref.subview %7[1, %workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 1, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %8[%workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 64, 4096] [1, 1, 1, 1] : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%11 = vector.transfer_read %subview_5[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%subview_8 = memref.subview %subview[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%13 = arith.mulf %12, %cst_3 : vector<64x64xf16> | |
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %13, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%14 = vector.transfer_read %alloc[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%16 = iree_vector_ext.to_layout %cst_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%17:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %cst_1, %arg2 = %cst_2, %arg3 = %cst_0) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) { | |
gpu.barrier | |
%23 = vector.transfer_read %subview_6[%c0, %c0, %arg0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%25 = vector.transfer_read %subview_7[%c0, %c0, %c0, %arg0], %cst {in_bounds = [true, true]} : memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%26 = iree_vector_ext.to_layout %25 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%alloc_9 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %24, %alloc_9[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%alloc_10 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %26, %alloc_10[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%27 = vector.transfer_read %alloc_9[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%28 = iree_vector_ext.to_layout %27 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%29 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %28, %15, %16 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%30 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%31 = vector.multi_reduction <maximumf>, %30, %arg1 [1] : vector<64x64xf32> to vector<64xf32> | |
%32 = arith.subf %arg1, %31 : vector<64xf32> | |
%33 = math.exp2 %32 : vector<64xf32> | |
%34 = arith.mulf %33, %arg2 : vector<64xf32> | |
%35 = vector.broadcast %31 : vector<64xf32> to vector<64x64xf32> | |
%36 = vector.transpose %35, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%37 = arith.subf %30, %36 : vector<64x64xf32> | |
%38 = math.exp2 %37 : vector<64x64xf32> | |
%39 = vector.multi_reduction <add>, %38, %34 [1] : vector<64x64xf32> to vector<64xf32> | |
%40 = arith.truncf %38 : vector<64x64xf32> to vector<64x64xf16> | |
%41 = vector.broadcast %33 : vector<64xf32> to vector<64x64xf32> | |
%42 = vector.transpose %41, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%43 = arith.mulf %42, %arg3 : vector<64x64xf32> | |
%44 = vector.transfer_read %alloc_10[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%45 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%46 = iree_vector_ext.to_layout %40 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%47 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%48 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %45, %46, %47 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%49 = iree_vector_ext.to_layout %48 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %31, %39, %49 : vector<64xf32>, vector<64xf32>, vector<64x64xf32> | |
} | |
%18 = vector.broadcast %17#1 : vector<64xf32> to vector<64x64xf32> | |
%19 = arith.divf %cst_4, %18 : vector<64x64xf32> | |
%20 = vector.transpose %19, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%21 = arith.mulf %20, %17#2 : vector<64x64xf32> | |
%22 = arith.truncf %21 : vector<64x64xf32> to vector<64x64xf16> | |
vector.transfer_write %22, %subview_8[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c64 = arith.constant 64 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_1 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_2 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_3 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_4 = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
gpu.barrier | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %9[%workgroup_id_z, %10, %workgroup_id_y, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %7[0, %workgroup_id_z, %workgroup_id_y, %10, 0] [1, 1, 1, 64, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_6 = memref.subview %7[1, %workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 1, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %8[%workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 64, 4096] [1, 1, 1, 1] : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%11 = vector.transfer_read %subview_5[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%subview_8 = memref.subview %subview[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%13 = arith.mulf %12, %cst_3 : vector<64x64xf16> | |
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %13, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%14 = vector.transfer_read %alloc[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%16 = iree_vector_ext.to_layout %cst_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%17:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %cst_1, %arg2 = %cst_2, %arg3 = %cst_0) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) { | |
gpu.barrier | |
%23 = vector.transfer_read %subview_6[%c0, %c0, %arg0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%25 = vector.transfer_read %subview_7[%c0, %c0, %c0, %arg0], %cst {in_bounds = [true, true]} : memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%26 = iree_vector_ext.to_layout %25 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%alloc_9 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %24, %alloc_9[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%alloc_10 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %26, %alloc_10[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%27 = vector.transfer_read %alloc_9[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%28 = iree_vector_ext.to_layout %27 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%29 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %28, %15, %16 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%30 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%31 = vector.multi_reduction <maximumf>, %30, %arg1 [1] : vector<64x64xf32> to vector<64xf32> | |
%32 = arith.subf %arg1, %31 : vector<64xf32> | |
%33 = math.exp2 %32 : vector<64xf32> | |
%34 = arith.mulf %33, %arg2 : vector<64xf32> | |
%35 = vector.broadcast %31 : vector<64xf32> to vector<64x64xf32> | |
%36 = vector.transpose %35, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%37 = arith.subf %30, %36 : vector<64x64xf32> | |
%38 = math.exp2 %37 : vector<64x64xf32> | |
%39 = vector.multi_reduction <add>, %38, %34 [1] : vector<64x64xf32> to vector<64xf32> | |
%40 = arith.truncf %38 : vector<64x64xf32> to vector<64x64xf16> | |
%41 = vector.broadcast %33 : vector<64xf32> to vector<64x64xf32> | |
%42 = vector.transpose %41, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%43 = arith.mulf %42, %arg3 : vector<64x64xf32> | |
%44 = vector.transfer_read %alloc_10[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%45 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%46 = iree_vector_ext.to_layout %40 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%47 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%48 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %45, %46, %47 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%49 = iree_vector_ext.to_layout %48 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %31, %39, %49 : vector<64xf32>, vector<64xf32>, vector<64x64xf32> | |
} | |
%18 = vector.broadcast %17#1 : vector<64xf32> to vector<64x64xf32> | |
%19 = arith.divf %cst_4, %18 : vector<64x64xf32> | |
%20 = vector.transpose %19, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%21 = arith.mulf %20, %17#2 : vector<64x64xf32> | |
%22 = arith.truncf %21 : vector<64x64xf32> to vector<64x64xf16> | |
vector.transfer_write %22, %subview_8[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c64 = arith.constant 64 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_1 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_2 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_3 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_4 = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
gpu.barrier | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %9[%workgroup_id_z, %10, %workgroup_id_y, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %7[0, %workgroup_id_z, %workgroup_id_y, %10, 0] [1, 1, 1, 64, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_6 = memref.subview %7[1, %workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 1, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %8[%workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 64, 4096] [1, 1, 1, 1] : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%11 = vector.transfer_read %subview_5[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%subview_8 = memref.subview %subview[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%13 = arith.mulf %12, %cst_3 : vector<64x64xf16> | |
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %13, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%14 = vector.transfer_read %alloc[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%16 = iree_vector_ext.to_layout %cst_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%17:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %cst_1, %arg2 = %cst_2, %arg3 = %cst_0) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) { | |
gpu.barrier | |
%23 = vector.transfer_read %subview_6[%c0, %c0, %arg0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%25 = vector.transfer_read %subview_7[%c0, %c0, %c0, %arg0], %cst {in_bounds = [true, true]} : memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%26 = iree_vector_ext.to_layout %25 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%alloc_9 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %24, %alloc_9[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%alloc_10 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %26, %alloc_10[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%27 = vector.transfer_read %alloc_9[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%28 = iree_vector_ext.to_layout %27 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%29 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %28, %15, %16 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%30 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%31 = vector.multi_reduction <maximumf>, %30, %arg1 [1] : vector<64x64xf32> to vector<64xf32> | |
%32 = arith.subf %arg1, %31 : vector<64xf32> | |
%33 = math.exp2 %32 : vector<64xf32> | |
%34 = arith.mulf %33, %arg2 : vector<64xf32> | |
%35 = vector.broadcast %31 : vector<64xf32> to vector<64x64xf32> | |
%36 = vector.transpose %35, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%37 = arith.subf %30, %36 : vector<64x64xf32> | |
%38 = math.exp2 %37 : vector<64x64xf32> | |
%39 = vector.multi_reduction <add>, %38, %34 [1] : vector<64x64xf32> to vector<64xf32> | |
%40 = arith.truncf %38 : vector<64x64xf32> to vector<64x64xf16> | |
%41 = vector.broadcast %33 : vector<64xf32> to vector<64x64xf32> | |
%42 = vector.transpose %41, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%43 = arith.mulf %42, %arg3 : vector<64x64xf32> | |
%44 = vector.transfer_read %alloc_10[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%45 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%46 = iree_vector_ext.to_layout %40 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%47 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%48 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %45, %46, %47 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%49 = iree_vector_ext.to_layout %48 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %31, %39, %49 : vector<64xf32>, vector<64xf32>, vector<64x64xf32> | |
} | |
%18 = vector.broadcast %17#1 : vector<64xf32> to vector<64x64xf32> | |
%19 = arith.divf %cst_4, %18 : vector<64x64xf32> | |
%20 = vector.transpose %19, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%21 = arith.mulf %20, %17#2 : vector<64x64xf32> | |
%22 = arith.truncf %21 : vector<64x64xf32> to vector<64x64xf16> | |
vector.transfer_write %22, %subview_8[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
return | |
} | |
// -----// IR Dump After HoistStaticallyBoundAllocationsPass (iree-codegen-hoist-statically-bound-allocations) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
%alloc_0 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c64 = arith.constant 64 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_3 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_4 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_5 = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
gpu.barrier | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %9[%workgroup_id_z, %10, %workgroup_id_y, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_6 = memref.subview %7[0, %workgroup_id_z, %workgroup_id_y, %10, 0] [1, 1, 1, 64, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %7[1, %workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 1, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_8 = memref.subview %8[%workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 64, 4096] [1, 1, 1, 1] : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%11 = vector.transfer_read %subview_6[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%subview_9 = memref.subview %subview[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%13 = arith.mulf %12, %cst_4 : vector<64x64xf16> | |
%alloc_10 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %13, %alloc_10[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%14 = vector.transfer_read %alloc_10[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%16 = iree_vector_ext.to_layout %cst_1 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%17:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %cst_2, %arg2 = %cst_3, %arg3 = %cst_1) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) { | |
gpu.barrier | |
%23 = vector.transfer_read %subview_7[%c0, %c0, %arg0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%25 = vector.transfer_read %subview_8[%c0, %c0, %c0, %arg0], %cst {in_bounds = [true, true]} : memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%26 = iree_vector_ext.to_layout %25 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
vector.transfer_write %24, %alloc_0[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %26, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%27 = vector.transfer_read %alloc_0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%28 = iree_vector_ext.to_layout %27 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%29 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %28, %15, %16 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%30 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%31 = vector.multi_reduction <maximumf>, %30, %arg1 [1] : vector<64x64xf32> to vector<64xf32> | |
%32 = arith.subf %arg1, %31 : vector<64xf32> | |
%33 = math.exp2 %32 : vector<64xf32> | |
%34 = arith.mulf %33, %arg2 : vector<64xf32> | |
%35 = vector.broadcast %31 : vector<64xf32> to vector<64x64xf32> | |
%36 = vector.transpose %35, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%37 = arith.subf %30, %36 : vector<64x64xf32> | |
%38 = math.exp2 %37 : vector<64x64xf32> | |
%39 = vector.multi_reduction <add>, %38, %34 [1] : vector<64x64xf32> to vector<64xf32> | |
%40 = arith.truncf %38 : vector<64x64xf32> to vector<64x64xf16> | |
%41 = vector.broadcast %33 : vector<64xf32> to vector<64x64xf32> | |
%42 = vector.transpose %41, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%43 = arith.mulf %42, %arg3 : vector<64x64xf32> | |
%44 = vector.transfer_read %alloc[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%45 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%46 = iree_vector_ext.to_layout %40 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%47 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%48 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %45, %46, %47 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%49 = iree_vector_ext.to_layout %48 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %31, %39, %49 : vector<64xf32>, vector<64xf32>, vector<64x64xf32> | |
} | |
%18 = vector.broadcast %17#1 : vector<64xf32> to vector<64x64xf32> | |
%19 = arith.divf %cst_5, %18 : vector<64x64xf32> | |
%20 = vector.transpose %19, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%21 = arith.mulf %20, %17#2 : vector<64x64xf32> | |
%22 = arith.truncf %21 : vector<64x64xf32> to vector<64x64xf16> | |
vector.transfer_write %22, %subview_9[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.dealloc %alloc_0 : memref<64x64xf16, #gpu.address_space<workgroup>> | |
memref.dealloc %alloc : memref<64x64xf16, #gpu.address_space<workgroup>> | |
return | |
} | |
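// Note: in the dump above, the two statically sized 64x64 workgroup staging buffers for the per-iteration K/V tiles (%alloc, %alloc_0) are allocated once at function entry and explicitly released with memref.dealloc before the return, while the buffer for the pre-scaled Q tile (%alloc_10) is still materialized at its point of use.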
// -----// IR Dump After LLVMGPUCastTypeToFitMMAPass (iree-llvmgpu-cast-type-to-fit-mma) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<1.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32> | |
%cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32> | |
%cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_4 = arith.constant 0.000000e+00 : f16 | |
%c64 = arith.constant 64 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
%alloc_5 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %9[%workgroup_id_z, %10, %workgroup_id_y, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_6 = memref.subview %7[0, %workgroup_id_z, %workgroup_id_y, %10, 0] [1, 1, 1, 64, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %7[1, %workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 1, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_8 = memref.subview %8[%workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 64, 4096] [1, 1, 1, 1] : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%11 = vector.transfer_read %subview_6[%c0, %c0, %c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%subview_9 = memref.subview %subview[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%13 = arith.mulf %12, %cst_0 : vector<64x64xf16> | |
%alloc_10 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %13, %alloc_10[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%14 = vector.transfer_read %alloc_10[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%16 = iree_vector_ext.to_layout %cst_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%17:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %cst_2, %arg2 = %cst_1, %arg3 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) { | |
gpu.barrier | |
%23 = vector.transfer_read %subview_7[%c0, %c0, %arg0, %c0], %cst_4 {in_bounds = [true, true]} : memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
%25 = vector.transfer_read %subview_8[%c0, %c0, %c0, %arg0], %cst_4 {in_bounds = [true, true]} : memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x64xf16> | |
%26 = iree_vector_ext.to_layout %25 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<64x64xf16> | |
vector.transfer_write %24, %alloc_5[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %26, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%27 = vector.transfer_read %alloc_5[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%28 = iree_vector_ext.to_layout %27 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%29 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %28, %15, %16 {iree.amdgpu.mma = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%30 = iree_vector_ext.to_layout %29 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%31 = vector.multi_reduction <maximumf>, %30, %arg1 [1] : vector<64x64xf32> to vector<64xf32> | |
%32 = arith.subf %arg1, %31 : vector<64xf32> | |
%33 = math.exp2 %32 : vector<64xf32> | |
%34 = arith.mulf %33, %arg2 : vector<64xf32> | |
%35 = vector.broadcast %31 : vector<64xf32> to vector<64x64xf32> | |
%36 = vector.transpose %35, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%37 = arith.subf %30, %36 : vector<64x64xf32> | |
%38 = math.exp2 %37 : vector<64x64xf32> | |
%39 = vector.multi_reduction <add>, %38, %34 [1] : vector<64x64xf32> to vector<64xf32> | |
%40 = arith.truncf %38 : vector<64x64xf32> to vector<64x64xf16> | |
%41 = vector.broadcast %33 : vector<64xf32> to vector<64x64xf32> | |
%42 = vector.transpose %41, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%43 = arith.mulf %42, %arg3 : vector<64x64xf32> | |
%44 = vector.transfer_read %alloc[%c0, %c0], %cst_4 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16> | |
%45 = iree_vector_ext.to_layout %44 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%46 = iree_vector_ext.to_layout %40 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16> | |
%47 = iree_vector_ext.to_layout %43 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%48 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %45, %46, %47 {iree.amdgpu.mma = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> | |
%49 = iree_vector_ext.to_layout %48 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %31, %39, %49 : vector<64xf32>, vector<64xf32>, vector<64x64xf32> | |
} | |
%18 = vector.broadcast %17#1 : vector<64xf32> to vector<64x64xf32> | |
%19 = arith.divf %cst, %18 : vector<64x64xf32> | |
%20 = vector.transpose %19, [1, 0] : vector<64x64xf32> to vector<64x64xf32> | |
%21 = arith.mulf %20, %17#2 : vector<64x64xf32> | |
%22 = arith.truncf %21 : vector<64x64xf32> to vector<64x64xf16> | |
vector.transfer_write %22, %subview_9[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf16>, memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.dealloc %alloc_5 : memref<64x64xf16, #gpu.address_space<workgroup>> | |
memref.dealloc %alloc : memref<64x64xf16, #gpu.address_space<workgroup>> | |
return | |
} | |
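// Note: the visible change relative to the previous dump is that LLVMGPUCastTypeToFitMMAPass has tagged the two vector.contract ops with {iree.amdgpu.mma = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>}; presumably no operand casts were needed because the f16 x f16 -> f32 contraction types already match the MFMA intrinsic. The rest of the body is unchanged.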
// -----// IR Dump After LLVMGPUVectorDistributePass (iree-llvmgpu-vector-distribute) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4x2x1x1x4x1xf32> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<2xf32> | |
%cst_1 = arith.constant dense<0xFF800000> : vector<2x1x1xf32> | |
%cst_2 = arith.constant dense<0.000000e+00> : vector<4x4x1x1x1x4xf16> | |
%cst_3 = arith.constant dense<0.000000e+00> : vector<2x4x1x1x1x4xf16> | |
%cst_4 = arith.constant dense<0.000000e+00> : vector<4x1x1x1x1x8xf16> | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c64 = arith.constant 64 : index | |
%cst_5 = arith.constant 0.000000e+00 : f16 | |
%cst_6 = arith.constant dense<0.000000e+00> : vector<2x4x1x1x1x4xf32> | |
%cst_7 = arith.constant dense<-3.40282347E+38> : vector<2x1x1xf32> | |
%cst_8 = arith.constant dense<0.000000e+00> : vector<2x1x1xf32> | |
%cst_9 = arith.constant dense<1.802980e-01> : vector<4x1x1x1x1x8xf16> | |
%cst_10 = arith.constant dense<1.000000e+00> : vector<4x2x1x1x4x1xf32> | |
%thread_id_z = gpu.thread_id z | |
%thread_id_y = gpu.thread_id y | |
%thread_id_x = gpu.thread_id x | |
%0 = affine.linearize_index disjoint [%thread_id_z, %thread_id_y, %thread_id_x] by (1, 1, 128) : index | |
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
%alloc_11 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6 = arith.index_castui %3 : i32 to index | |
%7:3 = util.assume.int | |
%4[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%5[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%6[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%7#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %8, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%7#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %9, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%10 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%7#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %10, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%11 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %10[%workgroup_id_z, %11, %workgroup_id_y, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_12 = memref.subview %8[0, %workgroup_id_z, %workgroup_id_y, %11, 0] [1, 1, 1, 64, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_13 = memref.subview %8[1, %workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 1, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_14 = memref.subview %9[%workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 64, 4096] [1, 1, 1, 1] : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%12 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16)>()[%0] | |
%13 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
%14 = vector.transfer_read %subview_12[%c0, %c0, %12, %13], %cst_5 {in_bounds = [true, true]} : memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%15 = vector.insert_strided_slice %14, %cst_4 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<4x1x1x1x1x8xf16> | |
%16 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 16)>()[%0] | |
%17 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
%18 = vector.transfer_read %subview_12[%c0, %c0, %16, %17], %cst_5 {in_bounds = [true, true]} : memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%19 = vector.insert_strided_slice %18, %15 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<4x1x1x1x1x8xf16> | |
%20 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 32)>()[%0] | |
%21 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
%22 = vector.transfer_read %subview_12[%c0, %c0, %20, %21], %cst_5 {in_bounds = [true, true]} : memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%23 = vector.insert_strided_slice %22, %19 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<4x1x1x1x1x8xf16> | |
%24 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 48)>()[%0] | |
%25 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
%26 = vector.transfer_read %subview_12[%c0, %c0, %24, %25], %cst_5 {in_bounds = [true, true]} : memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%27 = vector.insert_strided_slice %26, %23 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<4x1x1x1x1x8xf16> | |
%subview_15 = memref.subview %subview[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%28 = arith.mulf %27, %cst_9 : vector<4x1x1x1x1x8xf16> | |
%alloc_16 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
%29 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16)>()[%0] | |
%30 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
%31 = vector.extract %28[0, 0, 0, 0] : vector<1x8xf16> from vector<4x1x1x1x1x8xf16> | |
vector.transfer_write %31, %alloc_16[%29, %30] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%32 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 16)>()[%0] | |
%33 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
%34 = vector.extract %28[1, 0, 0, 0] : vector<1x8xf16> from vector<4x1x1x1x1x8xf16> | |
vector.transfer_write %34, %alloc_16[%32, %33] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%35 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 32)>()[%0] | |
%36 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
%37 = vector.extract %28[2, 0, 0, 0] : vector<1x8xf16> from vector<4x1x1x1x1x8xf16> | |
vector.transfer_write %37, %alloc_16[%35, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%38 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 48)>()[%0] | |
%39 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
%40 = vector.extract %28[3, 0, 0, 0] : vector<1x8xf16> from vector<4x1x1x1x1x8xf16> | |
vector.transfer_write %40, %alloc_16[%38, %39] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%41 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%42 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0] | |
%43 = vector.transfer_read %alloc_16[%41, %42], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%44 = vector.insert_strided_slice %43, %cst_3 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x4x1x1x1x4xf16> | |
%45 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%46 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0] | |
%47 = vector.transfer_read %alloc_16[%45, %46], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%48 = vector.insert_strided_slice %47, %44 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x4x1x1x1x4xf16> | |
%49 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%50 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0] | |
%51 = vector.transfer_read %alloc_16[%49, %50], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%52 = vector.insert_strided_slice %51, %48 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x4x1x1x1x4xf16> | |
%53 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%54 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0] | |
%55 = vector.transfer_read %alloc_16[%53, %54], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%56 = vector.insert_strided_slice %55, %52 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x4x1x1x1x4xf16> | |
%57 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%58 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0] | |
%59 = vector.transfer_read %alloc_16[%57, %58], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%60 = vector.insert_strided_slice %59, %56 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x4x1x1x1x4xf16> | |
%61 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%62 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0] | |
%63 = vector.transfer_read %alloc_16[%61, %62], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%64 = vector.insert_strided_slice %63, %60 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x4x1x1x1x4xf16> | |
%65 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%66 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0] | |
%67 = vector.transfer_read %alloc_16[%65, %66], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%68 = vector.insert_strided_slice %67, %64 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x4x1x1x1x4xf16> | |
%69 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%70 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0] | |
%71 = vector.transfer_read %alloc_16[%69, %70], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%72 = vector.insert_strided_slice %71, %68 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x4x1x1x1x4xf16> | |
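// Note: after vector distribution the 64x64 tiles are carried per thread rather than per workgroup. With workgroup_size = [128, 1, 1] and subgroup_size = 64 (two subgroups), the nested layout factors subgroup_tile x batch_tile x outer_tile x thread_tile x element_tile multiply out to 2*2*1*16*1 = 64 rows by 1*4*1*4*4 = 64 columns, so each thread holds small fragments such as the vector<2x4x1x1x1x4xf16> scaled-Q fragment %72 assembled above, and the transfer_read/transfer_write ops become the corresponding 1x8 (global) and 1x4 (MFMA-layout shared memory) per-thread slices.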
%73:3 = scf.for %arg0 = %c0 to %c4096 step %c64 iter_args(%arg1 = %cst_7, %arg2 = %cst_8, %arg3 = %cst_6) -> (vector<2x1x1xf32>, vector<2x1x1xf32>, vector<2x4x1x1x1x4xf32>) { | |
gpu.barrier | |
%126 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 16) * 16)>()[%arg0, %0] | |
%127 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
%128 = vector.transfer_read %subview_13[%c0, %c0, %126, %127], %cst_5 {in_bounds = [true, true]} : memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%129 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 16) * 16 + 16)>()[%arg0, %0] | |
%130 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
%131 = vector.transfer_read %subview_13[%c0, %c0, %129, %130], %cst_5 {in_bounds = [true, true]} : memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%132 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 16) * 16 + 32)>()[%arg0, %0] | |
%133 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
%134 = vector.transfer_read %subview_13[%c0, %c0, %132, %133], %cst_5 {in_bounds = [true, true]} : memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%135 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 16) * 16 + 48)>()[%arg0, %0] | |
%136 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
%137 = vector.transfer_read %subview_13[%c0, %c0, %135, %136], %cst_5 {in_bounds = [true, true]} : memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%138 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16)>()[%0] | |
%139 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg0, %0] | |
%140 = vector.transfer_read %subview_14[%c0, %c0, %138, %139], %cst_5 {in_bounds = [true, true]} : memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%141 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 16)>()[%0] | |
%142 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg0, %0] | |
%143 = vector.transfer_read %subview_14[%c0, %c0, %141, %142], %cst_5 {in_bounds = [true, true]} : memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%144 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 32)>()[%0] | |
%145 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg0, %0] | |
%146 = vector.transfer_read %subview_14[%c0, %c0, %144, %145], %cst_5 {in_bounds = [true, true]} : memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%147 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 48)>()[%0] | |
%148 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg0, %0] | |
%149 = vector.transfer_read %subview_14[%c0, %c0, %147, %148], %cst_5 {in_bounds = [true, true]} : memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%150 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16)>()[%0] | |
%151 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
vector.transfer_write %128, %alloc_11[%150, %151] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%152 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 16)>()[%0] | |
%153 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
vector.transfer_write %131, %alloc_11[%152, %153] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%154 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 32)>()[%0] | |
%155 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
vector.transfer_write %134, %alloc_11[%154, %155] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%156 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 48)>()[%0] | |
%157 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
vector.transfer_write %137, %alloc_11[%156, %157] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%158 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16)>()[%0] | |
%159 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
vector.transfer_write %140, %alloc[%158, %159] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%160 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 16)>()[%0] | |
%161 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
vector.transfer_write %143, %alloc[%160, %161] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%162 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 32)>()[%0] | |
%163 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
vector.transfer_write %146, %alloc[%162, %163] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%164 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 48)>()[%0] | |
%165 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
vector.transfer_write %149, %alloc[%164, %165] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%166 = affine.apply affine_map<()[s0] -> (s0 mod 16)>()[%0] | |
%167 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0] | |
%168 = vector.transfer_read %alloc_11[%166, %167], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%169 = vector.insert_strided_slice %168, %cst_2 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%170 = affine.apply affine_map<()[s0] -> (s0 mod 16)>()[%0] | |
%171 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0] | |
%172 = vector.transfer_read %alloc_11[%170, %171], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%173 = vector.insert_strided_slice %172, %169 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%174 = affine.apply affine_map<()[s0] -> (s0 mod 16)>()[%0] | |
%175 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0] | |
%176 = vector.transfer_read %alloc_11[%174, %175], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%177 = vector.insert_strided_slice %176, %173 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%178 = affine.apply affine_map<()[s0] -> (s0 mod 16)>()[%0] | |
%179 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0] | |
%180 = vector.transfer_read %alloc_11[%178, %179], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%181 = vector.insert_strided_slice %180, %177 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%182 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 16)>()[%0] | |
%183 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0] | |
%184 = vector.transfer_read %alloc_11[%182, %183], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%185 = vector.insert_strided_slice %184, %181 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%186 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 16)>()[%0] | |
%187 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0] | |
%188 = vector.transfer_read %alloc_11[%186, %187], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%189 = vector.insert_strided_slice %188, %185 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%190 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 16)>()[%0] | |
%191 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0] | |
%192 = vector.transfer_read %alloc_11[%190, %191], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%193 = vector.insert_strided_slice %192, %189 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%194 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 16)>()[%0] | |
%195 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0] | |
%196 = vector.transfer_read %alloc_11[%194, %195], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%197 = vector.insert_strided_slice %196, %193 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%198 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 32)>()[%0] | |
%199 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0] | |
%200 = vector.transfer_read %alloc_11[%198, %199], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%201 = vector.insert_strided_slice %200, %197 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%202 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 32)>()[%0] | |
%203 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0] | |
%204 = vector.transfer_read %alloc_11[%202, %203], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%205 = vector.insert_strided_slice %204, %201 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%206 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 32)>()[%0] | |
%207 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0] | |
%208 = vector.transfer_read %alloc_11[%206, %207], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%209 = vector.insert_strided_slice %208, %205 {offsets = [2, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%210 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 32)>()[%0] | |
%211 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0] | |
%212 = vector.transfer_read %alloc_11[%210, %211], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%213 = vector.insert_strided_slice %212, %209 {offsets = [2, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%214 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 48)>()[%0] | |
%215 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0] | |
%216 = vector.transfer_read %alloc_11[%214, %215], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%217 = vector.insert_strided_slice %216, %213 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%218 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 48)>()[%0] | |
%219 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0] | |
%220 = vector.transfer_read %alloc_11[%218, %219], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%221 = vector.insert_strided_slice %220, %217 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%222 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 48)>()[%0] | |
%223 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0] | |
%224 = vector.transfer_read %alloc_11[%222, %223], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%225 = vector.insert_strided_slice %224, %221 {offsets = [3, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%226 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 48)>()[%0] | |
%227 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0] | |
%228 = vector.transfer_read %alloc_11[%226, %227], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%229 = vector.insert_strided_slice %228, %225 {offsets = [3, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
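// The transfer_reads above gather a 4x4 grid of 1x4 f16 fragments (%229) from the 64x64
// workgroup tile %alloc_11. Per lane the row index is (%0 mod 16) plus a multiple of 16,
// and the column map simplifies to ((%0 floordiv 16) mod 4) * 4 plus a multiple of 16;
// %0 appears to be the x thread id in this dump.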
%230 = vector.extract %cst_6[0, 0] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32> | |
%231 = vector.extract %229[0, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%232 = vector.extract %72[0, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%233 = vector.shape_cast %231 : vector<1x1x1x4xf16> to vector<4xf16> | |
%234 = vector.shape_cast %232 : vector<1x1x1x4xf16> to vector<4xf16> | |
%235 = vector.shape_cast %230 : vector<1x1x1x4xf32> to vector<4xf32> | |
%236 = amdgpu.mfma %233 * %234 + %235 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
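// Each amdgpu.mfma with m = n = k = 16 on f16 operands corresponds to MFMA_F32_16x16x16_F16:
// the wavefront multiplies a 16x16 f16 tile by a 16x16 f16 tile into a 16x16 f32 accumulator,
// with every lane of the 64-wide wave holding a vector<4xf16> operand slice and a
// vector<4xf32> accumulator slice.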
%237 = vector.extract %229[0, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%238 = vector.extract %72[0, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%239 = vector.shape_cast %237 : vector<1x1x1x4xf16> to vector<4xf16> | |
%240 = vector.shape_cast %238 : vector<1x1x1x4xf16> to vector<4xf16> | |
%241 = amdgpu.mfma %239 * %240 + %236 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%242 = vector.extract %229[0, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%243 = vector.extract %72[0, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%244 = vector.shape_cast %242 : vector<1x1x1x4xf16> to vector<4xf16> | |
%245 = vector.shape_cast %243 : vector<1x1x1x4xf16> to vector<4xf16> | |
%246 = amdgpu.mfma %244 * %245 + %241 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%247 = vector.extract %229[0, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%248 = vector.extract %72[0, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%249 = vector.shape_cast %247 : vector<1x1x1x4xf16> to vector<4xf16> | |
%250 = vector.shape_cast %248 : vector<1x1x1x4xf16> to vector<4xf16> | |
%251 = amdgpu.mfma %249 * %250 + %246 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%252 = vector.shape_cast %251 : vector<4xf32> to vector<1x1x1x4xf32> | |
%253 = vector.insert %252, %cst_6 [0, 0] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32> | |
%254 = vector.extract %cst_6[0, 1] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32> | |
%255 = vector.extract %229[1, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%256 = vector.extract %72[0, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%257 = vector.shape_cast %255 : vector<1x1x1x4xf16> to vector<4xf16> | |
%258 = vector.shape_cast %256 : vector<1x1x1x4xf16> to vector<4xf16> | |
%259 = vector.shape_cast %254 : vector<1x1x1x4xf32> to vector<4xf32> | |
%260 = amdgpu.mfma %257 * %258 + %259 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%261 = vector.extract %229[1, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%262 = vector.extract %72[0, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%263 = vector.shape_cast %261 : vector<1x1x1x4xf16> to vector<4xf16> | |
%264 = vector.shape_cast %262 : vector<1x1x1x4xf16> to vector<4xf16> | |
%265 = amdgpu.mfma %263 * %264 + %260 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%266 = vector.extract %229[1, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%267 = vector.extract %72[0, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%268 = vector.shape_cast %266 : vector<1x1x1x4xf16> to vector<4xf16> | |
%269 = vector.shape_cast %267 : vector<1x1x1x4xf16> to vector<4xf16> | |
%270 = amdgpu.mfma %268 * %269 + %265 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%271 = vector.extract %229[1, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%272 = vector.extract %72[0, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%273 = vector.shape_cast %271 : vector<1x1x1x4xf16> to vector<4xf16> | |
%274 = vector.shape_cast %272 : vector<1x1x1x4xf16> to vector<4xf16> | |
%275 = amdgpu.mfma %273 * %274 + %270 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%276 = vector.shape_cast %275 : vector<4xf32> to vector<1x1x1x4xf32> | |
%277 = vector.insert %276, %253 [0, 1] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32> | |
%278 = vector.extract %cst_6[0, 2] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32> | |
%279 = vector.extract %229[2, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%280 = vector.extract %72[0, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%281 = vector.shape_cast %279 : vector<1x1x1x4xf16> to vector<4xf16> | |
%282 = vector.shape_cast %280 : vector<1x1x1x4xf16> to vector<4xf16> | |
%283 = vector.shape_cast %278 : vector<1x1x1x4xf32> to vector<4xf32> | |
%284 = amdgpu.mfma %281 * %282 + %283 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%285 = vector.extract %229[2, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%286 = vector.extract %72[0, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%287 = vector.shape_cast %285 : vector<1x1x1x4xf16> to vector<4xf16> | |
%288 = vector.shape_cast %286 : vector<1x1x1x4xf16> to vector<4xf16> | |
%289 = amdgpu.mfma %287 * %288 + %284 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%290 = vector.extract %229[2, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%291 = vector.extract %72[0, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%292 = vector.shape_cast %290 : vector<1x1x1x4xf16> to vector<4xf16> | |
%293 = vector.shape_cast %291 : vector<1x1x1x4xf16> to vector<4xf16> | |
%294 = amdgpu.mfma %292 * %293 + %289 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%295 = vector.extract %229[2, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%296 = vector.extract %72[0, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%297 = vector.shape_cast %295 : vector<1x1x1x4xf16> to vector<4xf16> | |
%298 = vector.shape_cast %296 : vector<1x1x1x4xf16> to vector<4xf16> | |
%299 = amdgpu.mfma %297 * %298 + %294 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%300 = vector.shape_cast %299 : vector<4xf32> to vector<1x1x1x4xf32> | |
%301 = vector.insert %300, %277 [0, 2] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32> | |
%302 = vector.extract %cst_6[0, 3] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32> | |
%303 = vector.extract %229[3, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%304 = vector.extract %72[0, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%305 = vector.shape_cast %303 : vector<1x1x1x4xf16> to vector<4xf16> | |
%306 = vector.shape_cast %304 : vector<1x1x1x4xf16> to vector<4xf16> | |
%307 = vector.shape_cast %302 : vector<1x1x1x4xf32> to vector<4xf32> | |
%308 = amdgpu.mfma %305 * %306 + %307 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%309 = vector.extract %229[3, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%310 = vector.extract %72[0, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%311 = vector.shape_cast %309 : vector<1x1x1x4xf16> to vector<4xf16> | |
%312 = vector.shape_cast %310 : vector<1x1x1x4xf16> to vector<4xf16> | |
%313 = amdgpu.mfma %311 * %312 + %308 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%314 = vector.extract %229[3, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%315 = vector.extract %72[0, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%316 = vector.shape_cast %314 : vector<1x1x1x4xf16> to vector<4xf16> | |
%317 = vector.shape_cast %315 : vector<1x1x1x4xf16> to vector<4xf16> | |
%318 = amdgpu.mfma %316 * %317 + %313 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%319 = vector.extract %229[3, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%320 = vector.extract %72[0, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%321 = vector.shape_cast %319 : vector<1x1x1x4xf16> to vector<4xf16> | |
%322 = vector.shape_cast %320 : vector<1x1x1x4xf16> to vector<4xf16> | |
%323 = amdgpu.mfma %321 * %322 + %318 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%324 = vector.shape_cast %323 : vector<4xf32> to vector<1x1x1x4xf32> | |
%325 = vector.insert %324, %301 [0, 3] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32> | |
%326 = vector.extract %cst_6[1, 0] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32> | |
%327 = vector.extract %229[0, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%328 = vector.extract %72[1, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%329 = vector.shape_cast %327 : vector<1x1x1x4xf16> to vector<4xf16> | |
%330 = vector.shape_cast %328 : vector<1x1x1x4xf16> to vector<4xf16> | |
%331 = vector.shape_cast %326 : vector<1x1x1x4xf32> to vector<4xf32> | |
%332 = amdgpu.mfma %329 * %330 + %331 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%333 = vector.extract %229[0, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%334 = vector.extract %72[1, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%335 = vector.shape_cast %333 : vector<1x1x1x4xf16> to vector<4xf16> | |
%336 = vector.shape_cast %334 : vector<1x1x1x4xf16> to vector<4xf16> | |
%337 = amdgpu.mfma %335 * %336 + %332 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%338 = vector.extract %229[0, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%339 = vector.extract %72[1, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%340 = vector.shape_cast %338 : vector<1x1x1x4xf16> to vector<4xf16> | |
%341 = vector.shape_cast %339 : vector<1x1x1x4xf16> to vector<4xf16> | |
%342 = amdgpu.mfma %340 * %341 + %337 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%343 = vector.extract %229[0, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%344 = vector.extract %72[1, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%345 = vector.shape_cast %343 : vector<1x1x1x4xf16> to vector<4xf16> | |
%346 = vector.shape_cast %344 : vector<1x1x1x4xf16> to vector<4xf16> | |
%347 = amdgpu.mfma %345 * %346 + %342 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%348 = vector.shape_cast %347 : vector<4xf32> to vector<1x1x1x4xf32> | |
%349 = vector.insert %348, %325 [1, 0] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32> | |
%350 = vector.extract %cst_6[1, 1] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32> | |
%351 = vector.extract %229[1, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%352 = vector.extract %72[1, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%353 = vector.shape_cast %351 : vector<1x1x1x4xf16> to vector<4xf16> | |
%354 = vector.shape_cast %352 : vector<1x1x1x4xf16> to vector<4xf16> | |
%355 = vector.shape_cast %350 : vector<1x1x1x4xf32> to vector<4xf32> | |
%356 = amdgpu.mfma %353 * %354 + %355 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%357 = vector.extract %229[1, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%358 = vector.extract %72[1, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%359 = vector.shape_cast %357 : vector<1x1x1x4xf16> to vector<4xf16> | |
%360 = vector.shape_cast %358 : vector<1x1x1x4xf16> to vector<4xf16> | |
%361 = amdgpu.mfma %359 * %360 + %356 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%362 = vector.extract %229[1, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%363 = vector.extract %72[1, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%364 = vector.shape_cast %362 : vector<1x1x1x4xf16> to vector<4xf16> | |
%365 = vector.shape_cast %363 : vector<1x1x1x4xf16> to vector<4xf16> | |
%366 = amdgpu.mfma %364 * %365 + %361 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%367 = vector.extract %229[1, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%368 = vector.extract %72[1, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%369 = vector.shape_cast %367 : vector<1x1x1x4xf16> to vector<4xf16> | |
%370 = vector.shape_cast %368 : vector<1x1x1x4xf16> to vector<4xf16> | |
%371 = amdgpu.mfma %369 * %370 + %366 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%372 = vector.shape_cast %371 : vector<4xf32> to vector<1x1x1x4xf32> | |
%373 = vector.insert %372, %349 [1, 1] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32> | |
%374 = vector.extract %cst_6[1, 2] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32> | |
%375 = vector.extract %229[2, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%376 = vector.extract %72[1, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%377 = vector.shape_cast %375 : vector<1x1x1x4xf16> to vector<4xf16> | |
%378 = vector.shape_cast %376 : vector<1x1x1x4xf16> to vector<4xf16> | |
%379 = vector.shape_cast %374 : vector<1x1x1x4xf32> to vector<4xf32> | |
%380 = amdgpu.mfma %377 * %378 + %379 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%381 = vector.extract %229[2, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%382 = vector.extract %72[1, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%383 = vector.shape_cast %381 : vector<1x1x1x4xf16> to vector<4xf16> | |
%384 = vector.shape_cast %382 : vector<1x1x1x4xf16> to vector<4xf16> | |
%385 = amdgpu.mfma %383 * %384 + %380 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%386 = vector.extract %229[2, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%387 = vector.extract %72[1, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%388 = vector.shape_cast %386 : vector<1x1x1x4xf16> to vector<4xf16> | |
%389 = vector.shape_cast %387 : vector<1x1x1x4xf16> to vector<4xf16> | |
%390 = amdgpu.mfma %388 * %389 + %385 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%391 = vector.extract %229[2, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%392 = vector.extract %72[1, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%393 = vector.shape_cast %391 : vector<1x1x1x4xf16> to vector<4xf16> | |
%394 = vector.shape_cast %392 : vector<1x1x1x4xf16> to vector<4xf16> | |
%395 = amdgpu.mfma %393 * %394 + %390 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%396 = vector.shape_cast %395 : vector<4xf32> to vector<1x1x1x4xf32> | |
%397 = vector.insert %396, %373 [1, 2] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32> | |
%398 = vector.extract %cst_6[1, 3] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32> | |
%399 = vector.extract %229[3, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%400 = vector.extract %72[1, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%401 = vector.shape_cast %399 : vector<1x1x1x4xf16> to vector<4xf16> | |
%402 = vector.shape_cast %400 : vector<1x1x1x4xf16> to vector<4xf16> | |
%403 = vector.shape_cast %398 : vector<1x1x1x4xf32> to vector<4xf32> | |
%404 = amdgpu.mfma %401 * %402 + %403 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%405 = vector.extract %229[3, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%406 = vector.extract %72[1, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%407 = vector.shape_cast %405 : vector<1x1x1x4xf16> to vector<4xf16> | |
%408 = vector.shape_cast %406 : vector<1x1x1x4xf16> to vector<4xf16> | |
%409 = amdgpu.mfma %407 * %408 + %404 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%410 = vector.extract %229[3, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%411 = vector.extract %72[1, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%412 = vector.shape_cast %410 : vector<1x1x1x4xf16> to vector<4xf16> | |
%413 = vector.shape_cast %411 : vector<1x1x1x4xf16> to vector<4xf16> | |
%414 = amdgpu.mfma %412 * %413 + %409 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%415 = vector.extract %229[3, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%416 = vector.extract %72[1, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%417 = vector.shape_cast %415 : vector<1x1x1x4xf16> to vector<4xf16> | |
%418 = vector.shape_cast %416 : vector<1x1x1x4xf16> to vector<4xf16> | |
%419 = amdgpu.mfma %417 * %418 + %414 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%420 = vector.shape_cast %419 : vector<4xf32> to vector<1x1x1x4xf32> | |
%421 = vector.insert %420, %397 [1, 3] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32> | |
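// The 32 MFMAs above (8 result tiles x 4 k-steps of 16) accumulate a 2x4 grid of 16x16 f32
// tiles, i.e. a 32x64 score tile for this wave; in this attention dispatch this looks like
// the Q*K^T stage.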
%422 = vector.multi_reduction <maximumf>, %421, %cst_1 [1, 3, 5] : vector<2x4x1x1x1x4xf32> to vector<2x1x1xf32> | |
%423 = vector.extract %422[0, 0, 0] : f32 from vector<2x1x1xf32> | |
%424 = gpu.subgroup_reduce maximumf %423 cluster(size = 4, stride = 16) : (f32) -> f32 | |
%425 = vector.insert %424, %cst_0 [0] : f32 into vector<2xf32> | |
%426 = vector.extract %422[1, 0, 0] : f32 from vector<2x1x1xf32> | |
%427 = gpu.subgroup_reduce maximumf %426 cluster(size = 4, stride = 16) : (f32) -> f32 | |
%428 = vector.insert %427, %425 [1] : f32 into vector<2xf32> | |
%429 = vector.shape_cast %428 : vector<2xf32> to vector<2x1x1xf32> | |
%430 = arith.maximumf %429, %arg1 : vector<2x1x1xf32> | |
%431 = arith.subf %arg1, %430 : vector<2x1x1xf32> | |
%432 = math.exp2 %431 : vector<2x1x1xf32> | |
%433 = arith.mulf %432, %arg2 : vector<2x1x1xf32> | |
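// Online-softmax max update: the per-row maxima of the score tile are reduced across the
// four lanes that share a row (cluster size = 4, stride = 16), combined with the previous
// running max %arg1 into %430, and %432 = exp2(old_max - new_max) rescales the running sum
// (%433) and, further below, the output accumulator.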
%434 = vector.extract %430[0, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%435 = vector.broadcast %434 : vector<1xf32> to vector<4x1xf32> | |
%436 = vector.insert %435, %cst [0, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%437 = vector.extract %430[1, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%438 = vector.broadcast %437 : vector<1xf32> to vector<4x1xf32> | |
%439 = vector.insert %438, %436 [0, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%440 = vector.extract %430[0, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%441 = vector.broadcast %440 : vector<1xf32> to vector<4x1xf32> | |
%442 = vector.insert %441, %439 [1, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%443 = vector.extract %430[1, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%444 = vector.broadcast %443 : vector<1xf32> to vector<4x1xf32> | |
%445 = vector.insert %444, %442 [1, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%446 = vector.extract %430[0, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%447 = vector.broadcast %446 : vector<1xf32> to vector<4x1xf32> | |
%448 = vector.insert %447, %445 [2, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%449 = vector.extract %430[1, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%450 = vector.broadcast %449 : vector<1xf32> to vector<4x1xf32> | |
%451 = vector.insert %450, %448 [2, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%452 = vector.extract %430[0, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%453 = vector.broadcast %452 : vector<1xf32> to vector<4x1xf32> | |
%454 = vector.insert %453, %451 [3, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%455 = vector.extract %430[1, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%456 = vector.broadcast %455 : vector<1xf32> to vector<4x1xf32> | |
%457 = vector.insert %456, %454 [3, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%458 = vector.transpose %457, [1, 0, 3, 2, 5, 4] : vector<4x2x1x1x4x1xf32> to vector<2x4x1x1x1x4xf32> | |
%459 = arith.subf %421, %458 : vector<2x4x1x1x1x4xf32> | |
%460 = math.exp2 %459 : vector<2x4x1x1x1x4xf32> | |
%461 = vector.multi_reduction <add>, %460, %cst_8 [1, 3, 5] : vector<2x4x1x1x1x4xf32> to vector<2x1x1xf32> | |
%462 = vector.extract %461[0, 0, 0] : f32 from vector<2x1x1xf32> | |
%463 = gpu.subgroup_reduce add %462 cluster(size = 4, stride = 16) : (f32) -> f32 | |
%464 = vector.insert %463, %cst_0 [0] : f32 into vector<2xf32> | |
%465 = vector.extract %461[1, 0, 0] : f32 from vector<2x1x1xf32> | |
%466 = gpu.subgroup_reduce add %465 cluster(size = 4, stride = 16) : (f32) -> f32 | |
%467 = vector.insert %466, %464 [1] : f32 into vector<2xf32> | |
%468 = vector.shape_cast %467 : vector<2xf32> to vector<2x1x1xf32> | |
%469 = arith.addf %468, %433 : vector<2x1x1xf32> | |
%470 = arith.truncf %460 : vector<2x4x1x1x1x4xf32> to vector<2x4x1x1x1x4xf16> | |
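// %458 broadcasts the new row maxima back over the accumulator layout, %460 =
// exp2(scores - rowmax) are the unnormalized probabilities, %469 is the updated running
// row sum, and %470 truncates the probabilities to f16 for the second MFMA stage.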
%471 = vector.extract %432[0, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%472 = vector.broadcast %471 : vector<1xf32> to vector<4x1xf32> | |
%473 = vector.insert %472, %cst [0, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%474 = vector.extract %432[1, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%475 = vector.broadcast %474 : vector<1xf32> to vector<4x1xf32> | |
%476 = vector.insert %475, %473 [0, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%477 = vector.extract %432[0, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%478 = vector.broadcast %477 : vector<1xf32> to vector<4x1xf32> | |
%479 = vector.insert %478, %476 [1, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%480 = vector.extract %432[1, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%481 = vector.broadcast %480 : vector<1xf32> to vector<4x1xf32> | |
%482 = vector.insert %481, %479 [1, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%483 = vector.extract %432[0, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%484 = vector.broadcast %483 : vector<1xf32> to vector<4x1xf32> | |
%485 = vector.insert %484, %482 [2, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%486 = vector.extract %432[1, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%487 = vector.broadcast %486 : vector<1xf32> to vector<4x1xf32> | |
%488 = vector.insert %487, %485 [2, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%489 = vector.extract %432[0, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%490 = vector.broadcast %489 : vector<1xf32> to vector<4x1xf32> | |
%491 = vector.insert %490, %488 [3, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%492 = vector.extract %432[1, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%493 = vector.broadcast %492 : vector<1xf32> to vector<4x1xf32> | |
%494 = vector.insert %493, %491 [3, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%495 = vector.transpose %494, [1, 0, 3, 2, 5, 4] : vector<4x2x1x1x4x1xf32> to vector<2x4x1x1x1x4xf32> | |
%496 = arith.mulf %495, %arg3 : vector<2x4x1x1x1x4xf32> | |
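// %495 broadcasts the exp2(old_max - new_max) correction factor over the accumulator
// layout and %496 rescales the previous output accumulator %arg3 before the new
// contribution is added via the MFMAs below.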
%497 = affine.apply affine_map<()[s0] -> (s0 mod 16)>()[%0] | |
%498 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0] | |
%499 = vector.transfer_read %alloc[%497, %498], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%500 = vector.insert_strided_slice %499, %cst_2 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%501 = affine.apply affine_map<()[s0] -> (s0 mod 16)>()[%0] | |
%502 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0] | |
%503 = vector.transfer_read %alloc[%501, %502], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%504 = vector.insert_strided_slice %503, %500 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%505 = affine.apply affine_map<()[s0] -> (s0 mod 16)>()[%0] | |
%506 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0] | |
%507 = vector.transfer_read %alloc[%505, %506], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%508 = vector.insert_strided_slice %507, %504 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%509 = affine.apply affine_map<()[s0] -> (s0 mod 16)>()[%0] | |
%510 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0] | |
%511 = vector.transfer_read %alloc[%509, %510], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%512 = vector.insert_strided_slice %511, %508 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%513 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 16)>()[%0] | |
%514 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0] | |
%515 = vector.transfer_read %alloc[%513, %514], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%516 = vector.insert_strided_slice %515, %512 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%517 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 16)>()[%0] | |
%518 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0] | |
%519 = vector.transfer_read %alloc[%517, %518], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%520 = vector.insert_strided_slice %519, %516 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%521 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 16)>()[%0] | |
%522 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0] | |
%523 = vector.transfer_read %alloc[%521, %522], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%524 = vector.insert_strided_slice %523, %520 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%525 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 16)>()[%0] | |
%526 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0] | |
%527 = vector.transfer_read %alloc[%525, %526], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%528 = vector.insert_strided_slice %527, %524 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%529 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 32)>()[%0] | |
%530 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0] | |
%531 = vector.transfer_read %alloc[%529, %530], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%532 = vector.insert_strided_slice %531, %528 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%533 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 32)>()[%0] | |
%534 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0] | |
%535 = vector.transfer_read %alloc[%533, %534], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%536 = vector.insert_strided_slice %535, %532 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%537 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 32)>()[%0] | |
%538 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0] | |
%539 = vector.transfer_read %alloc[%537, %538], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%540 = vector.insert_strided_slice %539, %536 {offsets = [2, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%541 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 32)>()[%0] | |
%542 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0] | |
%543 = vector.transfer_read %alloc[%541, %542], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%544 = vector.insert_strided_slice %543, %540 {offsets = [2, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%545 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 48)>()[%0] | |
%546 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0] | |
%547 = vector.transfer_read %alloc[%545, %546], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%548 = vector.insert_strided_slice %547, %544 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%549 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 48)>()[%0] | |
%550 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0] | |
%551 = vector.transfer_read %alloc[%549, %550], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%552 = vector.insert_strided_slice %551, %548 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%553 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 48)>()[%0] | |
%554 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0] | |
%555 = vector.transfer_read %alloc[%553, %554], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%556 = vector.insert_strided_slice %555, %552 {offsets = [3, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
%557 = affine.apply affine_map<()[s0] -> (s0 mod 16 + 48)>()[%0] | |
%558 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0] | |
%559 = vector.transfer_read %alloc[%557, %558], %cst_5 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%560 = vector.insert_strided_slice %559, %556 {offsets = [3, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<4x4x1x1x1x4xf16> | |
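// Same 4x4-fragment load pattern as above, this time from %alloc into %560; presumably
// this is the V tile staged in workgroup memory for the probability * value stage.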
%561 = vector.extract %496[0, 0] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32> | |
%562 = vector.extract %560[0, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%563 = vector.extract %470[0, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%564 = vector.shape_cast %562 : vector<1x1x1x4xf16> to vector<4xf16> | |
%565 = vector.shape_cast %563 : vector<1x1x1x4xf16> to vector<4xf16> | |
%566 = vector.shape_cast %561 : vector<1x1x1x4xf32> to vector<4xf32> | |
%567 = amdgpu.mfma %564 * %565 + %566 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%568 = vector.extract %560[0, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%569 = vector.extract %470[0, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%570 = vector.shape_cast %568 : vector<1x1x1x4xf16> to vector<4xf16> | |
%571 = vector.shape_cast %569 : vector<1x1x1x4xf16> to vector<4xf16> | |
%572 = amdgpu.mfma %570 * %571 + %567 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%573 = vector.extract %560[0, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%574 = vector.extract %470[0, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%575 = vector.shape_cast %573 : vector<1x1x1x4xf16> to vector<4xf16> | |
%576 = vector.shape_cast %574 : vector<1x1x1x4xf16> to vector<4xf16> | |
%577 = amdgpu.mfma %575 * %576 + %572 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%578 = vector.extract %560[0, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%579 = vector.extract %470[0, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%580 = vector.shape_cast %578 : vector<1x1x1x4xf16> to vector<4xf16> | |
%581 = vector.shape_cast %579 : vector<1x1x1x4xf16> to vector<4xf16> | |
%582 = amdgpu.mfma %580 * %581 + %577 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%583 = vector.shape_cast %582 : vector<4xf32> to vector<1x1x1x4xf32> | |
%584 = vector.insert %583, %cst_6 [0, 0] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32> | |
%585 = vector.extract %496[0, 1] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32> | |
%586 = vector.extract %560[1, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%587 = vector.extract %470[0, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%588 = vector.shape_cast %586 : vector<1x1x1x4xf16> to vector<4xf16> | |
%589 = vector.shape_cast %587 : vector<1x1x1x4xf16> to vector<4xf16> | |
%590 = vector.shape_cast %585 : vector<1x1x1x4xf32> to vector<4xf32> | |
%591 = amdgpu.mfma %588 * %589 + %590 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%592 = vector.extract %560[1, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%593 = vector.extract %470[0, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%594 = vector.shape_cast %592 : vector<1x1x1x4xf16> to vector<4xf16> | |
%595 = vector.shape_cast %593 : vector<1x1x1x4xf16> to vector<4xf16> | |
%596 = amdgpu.mfma %594 * %595 + %591 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%597 = vector.extract %560[1, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%598 = vector.extract %470[0, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%599 = vector.shape_cast %597 : vector<1x1x1x4xf16> to vector<4xf16> | |
%600 = vector.shape_cast %598 : vector<1x1x1x4xf16> to vector<4xf16> | |
%601 = amdgpu.mfma %599 * %600 + %596 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%602 = vector.extract %560[1, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%603 = vector.extract %470[0, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%604 = vector.shape_cast %602 : vector<1x1x1x4xf16> to vector<4xf16> | |
%605 = vector.shape_cast %603 : vector<1x1x1x4xf16> to vector<4xf16> | |
%606 = amdgpu.mfma %604 * %605 + %601 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%607 = vector.shape_cast %606 : vector<4xf32> to vector<1x1x1x4xf32> | |
%608 = vector.insert %607, %584 [0, 1] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32> | |
%609 = vector.extract %496[0, 2] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32> | |
%610 = vector.extract %560[2, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%611 = vector.extract %470[0, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%612 = vector.shape_cast %610 : vector<1x1x1x4xf16> to vector<4xf16> | |
%613 = vector.shape_cast %611 : vector<1x1x1x4xf16> to vector<4xf16> | |
%614 = vector.shape_cast %609 : vector<1x1x1x4xf32> to vector<4xf32> | |
%615 = amdgpu.mfma %612 * %613 + %614 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%616 = vector.extract %560[2, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%617 = vector.extract %470[0, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%618 = vector.shape_cast %616 : vector<1x1x1x4xf16> to vector<4xf16> | |
%619 = vector.shape_cast %617 : vector<1x1x1x4xf16> to vector<4xf16> | |
%620 = amdgpu.mfma %618 * %619 + %615 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%621 = vector.extract %560[2, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%622 = vector.extract %470[0, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%623 = vector.shape_cast %621 : vector<1x1x1x4xf16> to vector<4xf16> | |
%624 = vector.shape_cast %622 : vector<1x1x1x4xf16> to vector<4xf16> | |
%625 = amdgpu.mfma %623 * %624 + %620 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%626 = vector.extract %560[2, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%627 = vector.extract %470[0, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%628 = vector.shape_cast %626 : vector<1x1x1x4xf16> to vector<4xf16> | |
%629 = vector.shape_cast %627 : vector<1x1x1x4xf16> to vector<4xf16> | |
%630 = amdgpu.mfma %628 * %629 + %625 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%631 = vector.shape_cast %630 : vector<4xf32> to vector<1x1x1x4xf32> | |
%632 = vector.insert %631, %608 [0, 2] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32> | |
%633 = vector.extract %496[0, 3] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32> | |
%634 = vector.extract %560[3, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%635 = vector.extract %470[0, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%636 = vector.shape_cast %634 : vector<1x1x1x4xf16> to vector<4xf16> | |
%637 = vector.shape_cast %635 : vector<1x1x1x4xf16> to vector<4xf16> | |
%638 = vector.shape_cast %633 : vector<1x1x1x4xf32> to vector<4xf32> | |
%639 = amdgpu.mfma %636 * %637 + %638 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%640 = vector.extract %560[3, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%641 = vector.extract %470[0, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%642 = vector.shape_cast %640 : vector<1x1x1x4xf16> to vector<4xf16> | |
%643 = vector.shape_cast %641 : vector<1x1x1x4xf16> to vector<4xf16> | |
%644 = amdgpu.mfma %642 * %643 + %639 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%645 = vector.extract %560[3, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%646 = vector.extract %470[0, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%647 = vector.shape_cast %645 : vector<1x1x1x4xf16> to vector<4xf16> | |
%648 = vector.shape_cast %646 : vector<1x1x1x4xf16> to vector<4xf16> | |
%649 = amdgpu.mfma %647 * %648 + %644 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%650 = vector.extract %560[3, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%651 = vector.extract %470[0, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%652 = vector.shape_cast %650 : vector<1x1x1x4xf16> to vector<4xf16> | |
%653 = vector.shape_cast %651 : vector<1x1x1x4xf16> to vector<4xf16> | |
%654 = amdgpu.mfma %652 * %653 + %649 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%655 = vector.shape_cast %654 : vector<4xf32> to vector<1x1x1x4xf32> | |
%656 = vector.insert %655, %632 [0, 3] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32> | |
%657 = vector.extract %496[1, 0] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32> | |
%658 = vector.extract %560[0, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%659 = vector.extract %470[1, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%660 = vector.shape_cast %658 : vector<1x1x1x4xf16> to vector<4xf16> | |
%661 = vector.shape_cast %659 : vector<1x1x1x4xf16> to vector<4xf16> | |
%662 = vector.shape_cast %657 : vector<1x1x1x4xf32> to vector<4xf32> | |
%663 = amdgpu.mfma %660 * %661 + %662 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%664 = vector.extract %560[0, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%665 = vector.extract %470[1, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%666 = vector.shape_cast %664 : vector<1x1x1x4xf16> to vector<4xf16> | |
%667 = vector.shape_cast %665 : vector<1x1x1x4xf16> to vector<4xf16> | |
%668 = amdgpu.mfma %666 * %667 + %663 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%669 = vector.extract %560[0, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%670 = vector.extract %470[1, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%671 = vector.shape_cast %669 : vector<1x1x1x4xf16> to vector<4xf16> | |
%672 = vector.shape_cast %670 : vector<1x1x1x4xf16> to vector<4xf16> | |
%673 = amdgpu.mfma %671 * %672 + %668 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%674 = vector.extract %560[0, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%675 = vector.extract %470[1, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%676 = vector.shape_cast %674 : vector<1x1x1x4xf16> to vector<4xf16> | |
%677 = vector.shape_cast %675 : vector<1x1x1x4xf16> to vector<4xf16> | |
%678 = amdgpu.mfma %676 * %677 + %673 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%679 = vector.shape_cast %678 : vector<4xf32> to vector<1x1x1x4xf32> | |
%680 = vector.insert %679, %656 [1, 0] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32> | |
%681 = vector.extract %496[1, 1] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32> | |
%682 = vector.extract %560[1, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%683 = vector.extract %470[1, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%684 = vector.shape_cast %682 : vector<1x1x1x4xf16> to vector<4xf16> | |
%685 = vector.shape_cast %683 : vector<1x1x1x4xf16> to vector<4xf16> | |
%686 = vector.shape_cast %681 : vector<1x1x1x4xf32> to vector<4xf32> | |
%687 = amdgpu.mfma %684 * %685 + %686 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%688 = vector.extract %560[1, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%689 = vector.extract %470[1, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%690 = vector.shape_cast %688 : vector<1x1x1x4xf16> to vector<4xf16> | |
%691 = vector.shape_cast %689 : vector<1x1x1x4xf16> to vector<4xf16> | |
%692 = amdgpu.mfma %690 * %691 + %687 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%693 = vector.extract %560[1, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%694 = vector.extract %470[1, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%695 = vector.shape_cast %693 : vector<1x1x1x4xf16> to vector<4xf16> | |
%696 = vector.shape_cast %694 : vector<1x1x1x4xf16> to vector<4xf16> | |
%697 = amdgpu.mfma %695 * %696 + %692 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%698 = vector.extract %560[1, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%699 = vector.extract %470[1, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%700 = vector.shape_cast %698 : vector<1x1x1x4xf16> to vector<4xf16> | |
%701 = vector.shape_cast %699 : vector<1x1x1x4xf16> to vector<4xf16> | |
%702 = amdgpu.mfma %700 * %701 + %697 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%703 = vector.shape_cast %702 : vector<4xf32> to vector<1x1x1x4xf32> | |
%704 = vector.insert %703, %680 [1, 1] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32> | |
%705 = vector.extract %496[1, 2] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32> | |
%706 = vector.extract %560[2, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%707 = vector.extract %470[1, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%708 = vector.shape_cast %706 : vector<1x1x1x4xf16> to vector<4xf16> | |
%709 = vector.shape_cast %707 : vector<1x1x1x4xf16> to vector<4xf16> | |
%710 = vector.shape_cast %705 : vector<1x1x1x4xf32> to vector<4xf32> | |
%711 = amdgpu.mfma %708 * %709 + %710 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%712 = vector.extract %560[2, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%713 = vector.extract %470[1, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%714 = vector.shape_cast %712 : vector<1x1x1x4xf16> to vector<4xf16> | |
%715 = vector.shape_cast %713 : vector<1x1x1x4xf16> to vector<4xf16> | |
%716 = amdgpu.mfma %714 * %715 + %711 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%717 = vector.extract %560[2, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%718 = vector.extract %470[1, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%719 = vector.shape_cast %717 : vector<1x1x1x4xf16> to vector<4xf16> | |
%720 = vector.shape_cast %718 : vector<1x1x1x4xf16> to vector<4xf16> | |
%721 = amdgpu.mfma %719 * %720 + %716 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%722 = vector.extract %560[2, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%723 = vector.extract %470[1, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%724 = vector.shape_cast %722 : vector<1x1x1x4xf16> to vector<4xf16> | |
%725 = vector.shape_cast %723 : vector<1x1x1x4xf16> to vector<4xf16> | |
%726 = amdgpu.mfma %724 * %725 + %721 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%727 = vector.shape_cast %726 : vector<4xf32> to vector<1x1x1x4xf32> | |
%728 = vector.insert %727, %704 [1, 2] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32> | |
%729 = vector.extract %496[1, 3] : vector<1x1x1x4xf32> from vector<2x4x1x1x1x4xf32> | |
%730 = vector.extract %560[3, 0] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%731 = vector.extract %470[1, 0] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%732 = vector.shape_cast %730 : vector<1x1x1x4xf16> to vector<4xf16> | |
%733 = vector.shape_cast %731 : vector<1x1x1x4xf16> to vector<4xf16> | |
%734 = vector.shape_cast %729 : vector<1x1x1x4xf32> to vector<4xf32> | |
%735 = amdgpu.mfma %732 * %733 + %734 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%736 = vector.extract %560[3, 1] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%737 = vector.extract %470[1, 1] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%738 = vector.shape_cast %736 : vector<1x1x1x4xf16> to vector<4xf16> | |
%739 = vector.shape_cast %737 : vector<1x1x1x4xf16> to vector<4xf16> | |
%740 = amdgpu.mfma %738 * %739 + %735 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%741 = vector.extract %560[3, 2] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%742 = vector.extract %470[1, 2] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%743 = vector.shape_cast %741 : vector<1x1x1x4xf16> to vector<4xf16> | |
%744 = vector.shape_cast %742 : vector<1x1x1x4xf16> to vector<4xf16> | |
%745 = amdgpu.mfma %743 * %744 + %740 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%746 = vector.extract %560[3, 3] : vector<1x1x1x4xf16> from vector<4x4x1x1x1x4xf16> | |
%747 = vector.extract %470[1, 3] : vector<1x1x1x4xf16> from vector<2x4x1x1x1x4xf16> | |
%748 = vector.shape_cast %746 : vector<1x1x1x4xf16> to vector<4xf16> | |
%749 = vector.shape_cast %747 : vector<1x1x1x4xf16> to vector<4xf16> | |
%750 = amdgpu.mfma %748 * %749 + %745 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%751 = vector.shape_cast %750 : vector<4xf32> to vector<1x1x1x4xf32> | |
%752 = vector.insert %751, %728 [1, 3] : vector<1x1x1x4xf32> into vector<2x4x1x1x1x4xf32> | |
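// Another 32 MFMAs accumulate the f16 probabilities %470 against the %560 tile on top of
// the rescaled accumulator %496, yielding the updated (still unnormalized) output
// accumulator %752.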
scf.yield %430, %469, %752 : vector<2x1x1xf32>, vector<2x1x1xf32>, vector<2x4x1x1x1x4xf32> | |
} | |
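// The scf.for carries three values across K-blocks: the running row max (%73#0), the
// running row sum (%73#1), and the unnormalized output accumulator (%73#2).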
%74 = vector.extract %73#1[0, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%75 = vector.broadcast %74 : vector<1xf32> to vector<4x1xf32> | |
%76 = vector.insert %75, %cst [0, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%77 = vector.extract %73#1[1, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%78 = vector.broadcast %77 : vector<1xf32> to vector<4x1xf32> | |
%79 = vector.insert %78, %76 [0, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%80 = vector.extract %73#1[0, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%81 = vector.broadcast %80 : vector<1xf32> to vector<4x1xf32> | |
%82 = vector.insert %81, %79 [1, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%83 = vector.extract %73#1[1, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%84 = vector.broadcast %83 : vector<1xf32> to vector<4x1xf32> | |
%85 = vector.insert %84, %82 [1, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%86 = vector.extract %73#1[0, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%87 = vector.broadcast %86 : vector<1xf32> to vector<4x1xf32> | |
%88 = vector.insert %87, %85 [2, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%89 = vector.extract %73#1[1, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%90 = vector.broadcast %89 : vector<1xf32> to vector<4x1xf32> | |
%91 = vector.insert %90, %88 [2, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%92 = vector.extract %73#1[0, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%93 = vector.broadcast %92 : vector<1xf32> to vector<4x1xf32> | |
%94 = vector.insert %93, %91 [3, 0, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%95 = vector.extract %73#1[1, 0] : vector<1xf32> from vector<2x1x1xf32> | |
%96 = vector.broadcast %95 : vector<1xf32> to vector<4x1xf32> | |
%97 = vector.insert %96, %94 [3, 1, 0, 0] : vector<4x1xf32> into vector<4x2x1x1x4x1xf32> | |
%98 = arith.divf %cst_10, %97 : vector<4x2x1x1x4x1xf32> | |
%99 = vector.transpose %98, [1, 0, 3, 2, 5, 4] : vector<4x2x1x1x4x1xf32> to vector<2x4x1x1x1x4xf32> | |
%100 = arith.mulf %99, %73#2 : vector<2x4x1x1x1x4xf32> | |
%101 = arith.truncf %100 : vector<2x4x1x1x1x4xf32> to vector<2x4x1x1x1x4xf16> | |
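// Epilogue: %98 divides %cst_10 (presumably a splat of 1.0, matching %cst_11 in the
// canonicalized dump below) by the broadcast row sums, %100 normalizes the accumulator,
// and %101 truncates the result to f16 for the stores below.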
%102 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%103 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0] | |
%104 = vector.extract %101[0, 0, 0, 0] : vector<1x4xf16> from vector<2x4x1x1x1x4xf16> | |
vector.transfer_write %104, %subview_15[%102, %103] {in_bounds = [true, true]} : vector<1x4xf16>, memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%105 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%106 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0] | |
%107 = vector.extract %101[0, 1, 0, 0] : vector<1x4xf16> from vector<2x4x1x1x1x4xf16> | |
vector.transfer_write %107, %subview_15[%105, %106] {in_bounds = [true, true]} : vector<1x4xf16>, memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%108 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%109 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0] | |
%110 = vector.extract %101[0, 2, 0, 0] : vector<1x4xf16> from vector<2x4x1x1x1x4xf16> | |
vector.transfer_write %110, %subview_15[%108, %109] {in_bounds = [true, true]} : vector<1x4xf16>, memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%111 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%112 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0] | |
%113 = vector.extract %101[0, 3, 0, 0] : vector<1x4xf16> from vector<2x4x1x1x1x4xf16> | |
vector.transfer_write %113, %subview_15[%111, %112] {in_bounds = [true, true]} : vector<1x4xf16>, memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%114 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%115 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0] | |
%116 = vector.extract %101[1, 0, 0, 0] : vector<1x4xf16> from vector<2x4x1x1x1x4xf16> | |
vector.transfer_write %116, %subview_15[%114, %115] {in_bounds = [true, true]} : vector<1x4xf16>, memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%117 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%118 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0] | |
%119 = vector.extract %101[1, 1, 0, 0] : vector<1x4xf16> from vector<2x4x1x1x1x4xf16> | |
vector.transfer_write %119, %subview_15[%117, %118] {in_bounds = [true, true]} : vector<1x4xf16>, memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%120 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%121 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0] | |
%122 = vector.extract %101[1, 2, 0, 0] : vector<1x4xf16> from vector<2x4x1x1x1x4xf16> | |
vector.transfer_write %122, %subview_15[%120, %121] {in_bounds = [true, true]} : vector<1x4xf16>, memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%123 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%124 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0] | |
%125 = vector.extract %101[1, 3, 0, 0] : vector<1x4xf16> from vector<2x4x1x1x1x4xf16> | |
vector.transfer_write %125, %subview_15[%123, %124] {in_bounds = [true, true]} : vector<1x4xf16>, memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.dealloc %alloc_11 : memref<64x64xf16, #gpu.address_space<workgroup>> | |
memref.dealloc %alloc : memref<64x64xf16, #gpu.address_space<workgroup>> | |
return | |
} | |
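The eight vector.transfer_write ops above each store one vector<1x4xf16> fragment of the truncated f16 result into the workgroup's 64x64xf16 output subview, at a (row, col) computed from the thread id by the two affine maps. Below is a minimal Python sketch of that index arithmetic, not part of the dump: the map expressions are copied verbatim from the IR, the 128-thread workgroup size is taken from the translation_info in the next dump, and treating the map operand as the x thread id is an assumption. Evaluating the maps shows the fragments tile the 64x64 slice exactly once.

# Hypothetical illustration of the write-back index maps used above (not compiler output).
def row(t, off=0):
    # affine_map<()[s0] -> (s0 + (s0 floordiv 64)*32 - ((s0 floordiv 64) floordiv 2)*64
    #                          - (s0 floordiv 16)*16 + off)>, with off in {0, 16}
    return t + (t // 64) * 32 - ((t // 64) // 2) * 64 - (t // 16) * 16 + off

def col(t, off=0):
    # affine_map<()[s0] -> ((s0 floordiv 16)*4 - ((s0 floordiv 16) floordiv 4)*16 + off)>,
    # with off in {0, 16, 32, 48}
    return (t // 16) * 4 - ((t // 16) // 4) * 16 + off

writes = []
for t in range(128):                   # assumed workgroup_size = [128, 1, 1] (two 64-lane waves)
    for r_off in (0, 16):              # the two row fragments each thread stores
        for c_off in (0, 16, 32, 48):  # the four vector<1x4xf16> column fragments
            r, c = row(t, r_off), col(t, c_off)
            writes.extend((r, c + i) for i in range(4))  # each write covers 4 contiguous f16
assert sorted(writes) == [(r, c) for r in range(64) for c in range(64)]  # exact 64x64 cover, no overlap

Under these assumptions each 64-lane wave owns one 16x16 tile per (row, col) offset pair, consistent with a 16x16 MFMA accumulator layout; the {0, 16} row offsets and {0, 16, 32, 48} column offsets extend that to a 32x64 strip per wave and the full 64x64 tile per workgroup.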
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf32> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<4x2x1x1x4x1xf32> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<2xf32> | |
%cst_2 = arith.constant dense<0xFF800000> : vector<2x1x1xf32> | |
%cst_3 = arith.constant dense<0.000000e+00> : vector<4x4x1x1x1x4xf16> | |
%cst_4 = arith.constant dense<0.000000e+00> : vector<2x4x1x1x1x4xf16> | |
%cst_5 = arith.constant dense<0.000000e+00> : vector<4x1x1x1x1x8xf16> | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c64 = arith.constant 64 : index | |
%cst_6 = arith.constant 0.000000e+00 : f16 | |
%cst_7 = arith.constant dense<0.000000e+00> : vector<2x4x1x1x1x4xf32> | |
%cst_8 = arith.constant dense<-3.40282347E+38> : vector<2x1x1xf32> | |
%cst_9 = arith.constant dense<0.000000e+00> : vector<2x1x1xf32> | |
%cst_10 = arith.constant dense<1.802980e-01> : vector<4x1x1x1x1x8xf16> | |
%cst_11 = arith.constant dense<1.000000e+00> : vector<4x2x1x1x4x1xf32> | |
%thread_id_x = gpu.thread_id x | |
%alloc = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
%alloc_12 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%0 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = arith.index_castui %0 : i32 to index | |
%4 = arith.index_castui %1 : i32 to index | |
%5 = arith.index_castui %2 : i32 to index | |
%6:3 = util.assume.int | |
%3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>], | |
%4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>], | |
%5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>] | |
: index, index, index | |
%7 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %7, 1 : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%8 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %8, 1 : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%9 = hal.interface.binding.subspan layout(<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%6#2) flags(Indirect) : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %9, 1 : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %9[%workgroup_id_z, %10, %workgroup_id_y, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<2x4096x10x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_13 = memref.subview %7[0, %workgroup_id_z, %workgroup_id_y, %10, 0] [1, 1, 1, 64, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_14 = memref.subview %7[1, %workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 1, 4096, 64] [1, 1, 1, 1, 1] : memref<2x2x10x4096x64xf16, strided<[5242880, 2621440, 262144, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x4096x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_15 = memref.subview %8[%workgroup_id_z, %workgroup_id_y, 0, 0] [1, 1, 64, 4096] [1, 1, 1, 1] : memref<2x10x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x64x4096xf16, strided<[2621440, 262144, 4096, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16)>()[%thread_id_x] | |
%12 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
%13 = vector.transfer_read %subview_13[%c0, %c0, %11, %12], %cst_6 {in_bounds = [true, true]} : memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%14 = vector.insert_strided_slice %13, %cst_5 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<4x1x1x1x1x8xf16> | |
%15 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 16)>()[%thread_id_x] | |
%16 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
%17 = vector.transfer_read %subview_13[%c0, %c0, %15, %16], %cst_6 {in_bounds = [true, true]} : memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%18 = vector.insert_strided_slice %17, %14 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<4x1x1x1x1x8xf16> | |
%19 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 32)>()[%thread_id_x] | |
%20 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
%21 = vector.transfer_read %subview_13[%c0, %c0, %19, %20], %cst_6 {in_bounds = [true, true]} : memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%22 = vector.insert_strided_slice %21, %18 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<4x1x1x1x1x8xf16> | |
%23 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 48)>()[%thread_id_x] | |
%24 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
%25 = vector.transfer_read %subview_13[%c0, %c0, %23, %24], %cst_6 {in_bounds = [true, true]} : memref<1x1x64x64xf16, strided<[5242880, 2621440, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%26 = vector.insert_strided_slice %25, %22 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<4x1x1x1x1x8xf16> | |
%subview_16 = memref.subview %subview[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] : memref<1x64x1x64xf16, strided<[2621440, 640, 64, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[640, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%27 = arith.mulf %26, %cst_10 : vector<4x1x1x1x1x8xf16> | |
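    // The multiplier 1.802980e-01 is the f16 rounding of 0.125 * log2(e); this is consistent with the 1/sqrt(64) attention scale having been pre-folded into this operand for an exp2-based softmax, though the dump itself does not state that.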
%alloc_17 = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>> | |
%28 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16)>()[%thread_id_x] | |
%29 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
%30 = vector.extract %27[0, 0, 0, 0] : vector<1x8xf16> from vector<4x1x1x1x1x8xf16> | |
vector.transfer_write %30, %alloc_17[%28, %29] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%31 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 16)>()[%thread_id_x] | |
%32 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
%33 = vector.extract %27[1, 0, 0, 0] : vector<1x8xf16> from vector<4x1x1x1x1x8xf16> | |
vector.transfer_write %33, %alloc_17[%31, %32] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%34 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 32)>()[%thread_id_x] | |
%35 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
%36 = vector.extract %27[2, 0, 0, 0] : vector<1x8xf16> from vector<4x1x1x1x1x8xf16> | |
vector.transfer_write %36, %alloc_17[%34, %35] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
%37 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 16 + 48)>()[%thread_id_x] | |
%38 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
%39 = vector.extract %27[3, 0, 0, 0] : vector<1x8xf16> from vector<4x1x1x1x1x8xf16> | |
vector.transfer_write %39, %alloc_17[%37, %38] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16, #gpu.address_space<workgroup>> | |
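    // At this point each of the 128 threads has staged four vector<1x8xf16> rows of the scaled tile read from %subview_13 into %alloc_17 (per the maps above: row = (tid floordiv 8) mod 16 plus an offset in {0, 16, 32, 48}, col = 8 * (tid mod 8)), covering the whole 64x64 workgroup tile; the barrier below publishes it for the MFMA-layout reads that follow.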
gpu.barrier | |
%40 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%41 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x] | |
%42 = vector.transfer_read %alloc_17[%40, %41], %cst_6 {in_bounds = [true, true]} : memref<64x64xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%43 = vector.insert_strided_slice %42, %cst_4 {offsets = [0, 0 |