@kuhar
Created March 7, 2023 00:49
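Pass-by-pass IR dumps from the IREE SPIR-V codegen pipeline for a dispatch (_reverse_dim0_dispatch_0) that reverses a 2x1xf32 tensor along dimension 0, compiled for the vulkan-spirv-fb target. (Dumps like these are typically produced with MLIR's --mlir-print-ir-after-all option; that flag name is an assumption, not stated in the gist.)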
// -----// IR Dump After TypePropagation (iree-codegen-type-propagation) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<2x1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%2 : tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) : tensor<2x1xf32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
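// Note: at this point the dispatch is a single iree_linalg_ext.reverse over the whole
// 2x1xf32 tensor along dimension 0, i.e. out[i, j] = in[1 - i, j] (roughly NumPy's
// np.flip(x, axis=0), given as an illustration only); the next few passes leave it unchanged.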
// -----// IR Dump After BufferizeCopyOnlyDispatches (iree-codegen-bufferize-copy-only-dispatches) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<2x1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%2 : tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) : tensor<2x1xf32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
// -----// IR Dump After DecomposeSoftmax (iree-linalg-ext-decompose-softmax) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<2x1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%2 : tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) : tensor<2x1xf32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
// -----// IR Dump After RematerializeParallelOps (iree-codegen-rematerialize-parallel-ops) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<2x1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%2 : tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) : tensor<2x1xf32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
// -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) //----- //
hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>}> {
hal.executable.export public @_reverse_dim0_dispatch_0 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [64 : index, 1 : index, 1 : index]} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg2]
hal.return %0, %arg1, %c1 : index, index, index
}
builtin.module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1], sizes = [1, %c1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<1x?xf32>
%5 = tensor.empty() : tensor<1x1xf32>
%6 = iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%4 : tensor<1x?xf32>) outs(%5 : tensor<1x1xf32>) : tensor<1x1xf32>
%cast = tensor.cast %6 : tensor<1x1xf32> to tensor<1x?xf32>
%7 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
flow.dispatch.tensor.store %cast, %1, offsets = [%7, %arg1], sizes = [1, %c1], strides = [1, 1] : tensor<1x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
}
}
return
}
}
}
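// Note: after TileAndDistributeToWorkgroups the dump also shows the surrounding
// hal.executable.variant and hal.executable.export (workgroup_size [64, 1, 1], a ceildiv-64
// workgroup-count region). The reverse op gets a lowering_config with
// tile_sizes [[1, 64], [1, 1]], and the first-level [1, 64] tiles are distributed over
// workgroups with grid-stride scf.for loops, roughly:
//   for i = workgroup_id_y to 2 step workgroup_count_y:              // rows, tile size 1
//     for j = workgroup_id_x * 64 to 1 step workgroup_count_x * 64:  // cols, tile size 64
//       reverse the input tile at [i, j] into the output tile at row (1 - i)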
// -----// IR Dump After ConvertToDestinationPassingStyle (iree-codegen-convert-to-destination-passing-style) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%5 = flow.dispatch.tensor.load %1, offsets = [%4, %arg1], sizes = [1, %c1], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> -> tensor<1x?xf32>
%cast = tensor.cast %5 : tensor<1x?xf32> to tensor<1x1xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1], sizes = [1, %c1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<1x?xf32>
%7 = iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%6 : tensor<1x?xf32>) outs(%cast : tensor<1x1xf32>) : tensor<1x1xf32>
%cast_0 = tensor.cast %7 : tensor<1x1xf32> to tensor<1x?xf32>
%8 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
flow.dispatch.tensor.store %cast_0, %1, offsets = [%8, %arg1], sizes = [1, %c1], strides = [1, 1] : tensor<1x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
}
}
return
}
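// Note: ConvertToDestinationPassingStyle replaces the tensor.empty() destination with a
// flow.dispatch.tensor.load of the output binding at the already-flipped row offset
// (-arg0 + 1), so the reverse now writes directly into the slice that is stored back.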
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%5 = flow.dispatch.tensor.load %1, offsets = [%4, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> -> tensor<1x1xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<1x1xf32>
%7 = iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%6 : tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) : tensor<1x1xf32>
%8 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
flow.dispatch.tensor.store %7, %1, offsets = [%8, %arg1], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
}
}
return
}
}
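// Note: canonicalization folds the %c1 slice sizes to the static constant 1, so the loads,
// the reverse, and the store all operate on tensor<1x1xf32> and the tensor.cast ops disappear;
// the CSE run that follows also dedupes the repeated (-s0 + 1) affine.apply.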
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%5 = flow.dispatch.tensor.load %1, offsets = [%4, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> -> tensor<1x1xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<1x1xf32>
%7 = iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%6 : tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) : tensor<1x1xf32>
flow.dispatch.tensor.store %7, %1, offsets = [%4, %arg1], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
}
}
return
}
}
// -----// IR Dump After EliminateEmptyTensors (iree-eliminate-empty-tensors) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%5 = flow.dispatch.tensor.load %1, offsets = [%4, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> -> tensor<1x1xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<1x1xf32>
%7 = iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%6 : tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) : tensor<1x1xf32>
flow.dispatch.tensor.store %7, %1, offsets = [%4, %arg1], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
}
}
return
}
}
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%5 = flow.dispatch.tensor.load %1, offsets = [%4, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> -> tensor<1x1xf32>
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<1x1xf32>
%7 = iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%6 : tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) : tensor<1x1xf32>
flow.dispatch.tensor.store %7, %1, offsets = [%4, %arg1], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
}
}
return
}
}
// -----// IR Dump After IREEComprehensiveBufferize (iree-codegen-iree-comprehensive-bufferize) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %4 to %c1 step %5 {
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%subview = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
%subview_1 = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview, %subview_1 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
return
}
}
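// Note: comprehensive bufferization rewrites the dispatch on memrefs: the bindings become
// memref<2x1xf32> subspans with assume_alignment, the tensor loads/stores become
// memref.subview slices, and the reverse runs in place on those subviews. The trailing
// memref.copy between two identical subviews of the output buffer is redundant and is
// removed by the CSE and canonicalization runs that follow.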
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %4 to %c1 step %5 {
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%subview = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
%subview_1 = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview, %subview_1 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %4 to %c1 step %5 {
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%subview = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
%subview_1 = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview, %subview_1 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %4 to %c1 step %5 {
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%subview = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.copy %subview, %subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %4 to %c1 step %5 {
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%subview = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
}
}
return
}
// -----// IR Dump After CleanupBufferAllocView (iree-codegen-cleanup-buffer-alloc-view) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
}
}
return
}
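// Note: CleanupBufferAllocView drops the now-unused !flow.dispatch.tensor subspans, leaving
// only the two memref bindings.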
// -----// IR Dump After SPIRVTileAndDistribute (iree-spirv-tile-and-distribute) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = gpu.thread_id x
%6 = gpu.block_dim x
%7 = gpu.thread_id y
%8 = gpu.block_dim y
scf.for %arg2 = %7 to %c1 step %8 {
scf.for %arg3 = %5 to %c1 step %6 {
%cast = memref.cast %subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%cast_1 = memref.cast %subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.reverse {__internal_linalg_transform__ = "tile_reduction", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%cast : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%cast_1 : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
}
}
}
}
return
}
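// Note: SPIRVTileAndDistribute applies the second tiling level ([1, 1]) and distributes it
// over threads: gpu.thread_id / gpu.block_dim for x and y drive grid-stride scf.for loops
// over the workgroup's tile, and the reverse op (now on dynamically shaped memref casts) is
// tagged with __internal_linalg_transform__ = "tile_reduction". Roughly:
//   for ty = thread_id_y to 1 step block_dim_y:
//     for tx = thread_id_x to 1 step block_dim_x:
//       reverse the per-thread 1x1 tile in place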
// -----// IR Dump After MemrefCopyToLinalgPass (iree-codegen-memrefcopy-to-linalg) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = gpu.thread_id x
%6 = gpu.block_dim x
%7 = gpu.thread_id y
%8 = gpu.block_dim y
scf.for %arg2 = %7 to %c1 step %8 {
scf.for %arg3 = %5 to %c1 step %6 {
%cast = memref.cast %subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%cast_1 = memref.cast %subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.reverse {__internal_linalg_transform__ = "tile_reduction", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%cast : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%cast_1 : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
}
}
}
}
return
}
// -----// IR Dump After GPUDistributeSharedMemoryCopy (iree-gpu-distribute-shared-memory-copy) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = gpu.thread_id x
%6 = gpu.block_dim x
%7 = gpu.thread_id y
%8 = gpu.block_dim y
scf.for %arg2 = %7 to %c1 step %8 {
scf.for %arg3 = %5 to %c1 step %6 {
%cast = memref.cast %subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%cast_1 = memref.cast %subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.reverse {__internal_linalg_transform__ = "tile_reduction", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%cast : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%cast_1 : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
}
}
}
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = gpu.thread_id x
%6 = gpu.block_dim x
%7 = gpu.thread_id y
%8 = gpu.block_dim y
scf.for %arg2 = %7 to %c1 step %8 {
scf.for %arg3 = %5 to %c1 step %6 {
%cast = memref.cast %subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%cast_1 = memref.cast %subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.reverse {__internal_linalg_transform__ = "tile_reduction", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%cast : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%cast_1 : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
}
}
}
}
return
}
}
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = gpu.thread_id x
%6 = gpu.block_dim x
%7 = gpu.thread_id y
%8 = gpu.block_dim y
scf.for %arg2 = %7 to %c1 step %8 {
scf.for %arg3 = %5 to %c1 step %6 {
%cast = memref.cast %subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%cast_1 = memref.cast %subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.reverse {__internal_linalg_transform__ = "tile_reduction", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%cast : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%cast_1 : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
}
}
}
}
return
}
}
// -----// IR Dump After LinalgExtToLoops (iree-linalg-ext-to-loops) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = gpu.thread_id x
%6 = gpu.block_dim x
%7 = gpu.thread_id y
%8 = gpu.block_dim y
scf.for %arg2 = %7 to %c1 step %8 {
scf.for %arg3 = %5 to %c1 step %6 {
scf.for %arg4 = %c0 to %c1 step %c1 {
scf.for %arg5 = %c0 to %c1 step %c1 {
%9 = arith.subi %c0, %arg4 : index
%10 = memref.load %subview_0[%arg4, %arg5] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.store %10, %subview[%9, %arg5] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
}
}
}
}
return
}
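// Note: LinalgExtToLoops lowers the reverse op to scalar loops: load in[i, j] from the input
// subview and store it to out[0 - i, j] in the output subview. Since each per-thread tile is
// 1x1, i is always 0 here; the actual dimension-0 flip is already encoded in the output
// subview's row offset (-arg0 + 1).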
// -----// IR Dump After MemrefCopyToLinalgPass (iree-codegen-memrefcopy-to-linalg) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = gpu.thread_id x
%6 = gpu.block_dim x
%7 = gpu.thread_id y
%8 = gpu.block_dim y
scf.for %arg2 = %7 to %c1 step %8 {
scf.for %arg3 = %5 to %c1 step %6 {
scf.for %arg4 = %c0 to %c1 step %c1 {
scf.for %arg5 = %c0 to %c1 step %c1 {
%9 = arith.subi %c0, %arg4 : index
%10 = memref.load %subview_0[%arg4, %arg5] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.store %10, %subview[%9, %arg5] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
}
}
}
}
return
}
// -----// IR Dump After LinalgLowerToLoops (convert-linalg-to-loops) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = gpu.thread_id x
%6 = gpu.block_dim x
%7 = gpu.thread_id y
%8 = gpu.block_dim y
scf.for %arg2 = %7 to %c1 step %8 {
scf.for %arg3 = %5 to %c1 step %6 {
scf.for %arg4 = %c0 to %c1 step %c1 {
scf.for %arg5 = %c0 to %c1 step %c1 {
%9 = arith.subi %c0, %arg4 : index
%10 = memref.load %subview_0[%arg4, %arg5] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.store %10, %subview[%9, %arg5] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
}
}
}
}
return
}
// -----// IR Dump After RemoveSingleIterationLoop (iree-codegen-remove-single-iteration-loop) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = gpu.thread_id x
%6 = gpu.block_dim x
scf.for %arg2 = %5 to %c1 step %6 {
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
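// Note: RemoveSingleIterationLoop folds the unit-trip-count loops (the y-thread loop and the
// two 1x1 tile loops), leaving a single scalar load/store per thread inside the remaining
// x-thread loop.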
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = gpu.thread_id x
%6 = gpu.block_dim x
scf.for %arg2 = %5 to %c1 step %6 {
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
}
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = gpu.thread_id x
%6 = gpu.block_dim x
scf.for %arg2 = %5 to %c1 step %6 {
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
}
// -----// IR Dump After SPIRVLowerExecutableTarget (iree-spirv-lower-executable-target-pass) //----- //
hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>}> {
hal.executable.export public @_reverse_dim0_dispatch_0 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [64 : index, 1 : index, 1 : index]} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg2]
hal.return %0, %arg1, %c1 : index, index, index
}
builtin.module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = gpu.thread_id x
%6 = gpu.block_dim x
scf.for %arg2 = %5 to %c1 step %6 {
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = gpu.thread_id x
%6 = gpu.block_dim x
scf.for %arg2 = %5 to %c1 step %6 {
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
}
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = gpu.thread_id x
%6 = gpu.block_dim x
scf.for %arg2 = %5 to %c1 step %6 {
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
}
// -----// IR Dump After PolynomialApproximationPass (iree-codegen-polynomial-approximation) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = gpu.thread_id x
%6 = gpu.block_dim x
scf.for %arg2 = %5 to %c1 step %6 {
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
// -----// IR Dump After PadDynamicAlloc (iree-codegen-pad-dynamic-alloc) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = gpu.thread_id x
%6 = gpu.block_dim x
scf.for %arg2 = %5 to %c1 step %6 {
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%5 = gpu.thread_id x
%6 = gpu.block_dim x
scf.for %arg2 = %5 to %c1 step %6 {
%7 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg0]
%8 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
%9 = memref.load %0[%7, %8] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%10 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%4]
%11 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
memref.store %9, %1[%10, %11] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
}
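// Note: fold-memref-alias-ops removed the 1x1 subviews by composing their
// offsets with the (zero) local indices, which is why every access above now
// goes through an affine.apply of the form (d0)[s0] -> (d0 + s0). A small
// Python sketch of that index folding, with hypothetical helper names:

def fold_subview_index(subview_offsets, local_indices):
    # A load/store at local index [l0, l1] on a subview taken at offsets
    # [o0, o1] becomes an access on the parent buffer at [o0 + l0, o1 + l1]
    # (unit strides assumed, as in this dispatch).
    return [o + l for o, l in zip(subview_offsets, local_indices)]

# For the store: subview offset [1 - i, j] with local index [0, 0], e.g. i = 0:
assert fold_subview_index([1, 0], [0, 0]) == [1, 0]
# A later canonicalization folds the "+ 0" away, leaving the direct
# %1[1 - i, j] store seen in the subsequent dumps.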
// -----// IR Dump After ExpandOps (memref-expand) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
%5 = gpu.thread_id x
%6 = gpu.block_dim x
scf.for %arg2 = %5 to %c1 step %6 {
%7 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg0]
%8 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
%9 = memref.load %0[%7, %8] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%10 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%4]
%11 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
memref.store %9, %1[%10, %11] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = gpu.thread_id x
%5 = gpu.block_dim x
scf.for %arg2 = %4 to %c1 step %5 {
%6 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%7 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
memref.store %6, %1[%7, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
}
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = gpu.thread_id x
%5 = gpu.block_dim x
scf.for %arg2 = %4 to %c1 step %5 {
%6 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%7 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
memref.store %6, %1[%7, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
}
// -----// IR Dump After SPIRVVectorizeLoadStore (iree-spirv-vectorize-load-store) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %2 to %c1 step %3 {
%4 = gpu.thread_id x
%5 = gpu.block_dim x
scf.for %arg2 = %4 to %c1 step %5 {
%6 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%7 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
memref.store %6, %1[%7, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
}
// -----// IR Dump After OptimizeVectorTransfer (iree-codegen-optimize-vector-transfer) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
%4 = gpu.thread_id x
%5 = gpu.block_dim x
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
scf.for %arg1 = %2 to %c1 step %3 {
scf.for %arg2 = %4 to %c1 step %5 {
%7 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.store %7, %1[%6, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
// -----// IR Dump After SPIRVBreakDownLargeVector (iree-spirv-breakdown-large-vector) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
%4 = gpu.thread_id x
%5 = gpu.block_dim x
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
scf.for %arg1 = %2 to %c1 step %3 {
scf.for %arg2 = %4 to %c1 step %5 {
%7 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.store %7, %1[%6, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
// -----// IR Dump After ForOpCanonicalization (iree-codegen-canonicalize-scf-for) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
%4 = gpu.thread_id x
%5 = gpu.block_dim x
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
scf.for %arg1 = %2 to %c1 step %3 {
scf.for %arg2 = %4 to %c1 step %5 {
%7 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.store %7, %1[%6, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
%4 = gpu.thread_id x
%5 = gpu.block_dim x
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
scf.for %arg1 = %2 to %c1 step %3 {
scf.for %arg2 = %4 to %c1 step %5 {
%7 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.store %7, %1[%6, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
}
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
%4 = gpu.thread_id x
%5 = gpu.block_dim x
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
scf.for %arg1 = %2 to %c1 step %3 {
scf.for %arg2 = %4 to %c1 step %5 {
%7 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.store %7, %1[%6, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
}
// -----// IR Dump After OptimizeVectorTransfer (iree-codegen-optimize-vector-transfer) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
%4 = gpu.thread_id x
%5 = gpu.block_dim x
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0]
scf.for %arg1 = %2 to %c1 step %3 {
scf.for %arg2 = %4 to %c1 step %5 {
%7 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
memref.store %7, %1[%6, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
// -----// IR Dump After FlattenMemRefSubspan (iree-codegen-flatten-memref-subspan) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2}
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
%4 = gpu.thread_id x
%5 = gpu.block_dim x
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
scf.for %arg1 = %2 to %c1 step %3 {
scf.for %arg2 = %4 to %c1 step %5 {
%6 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg0, %arg1)
%7 = memref.load %0[%6] : memref<?xf32, #hal.descriptor_type<storage_buffer>>
%8 = affine.apply affine_map<(d0)[s0] -> (d0 - s0 + 1)>(%arg1)[%arg0]
memref.store %7, %1[%8] : memref<?xf32, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
}
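// Note: flattening turns both 2x1 memrefs into 1-D buffers of length 2, so the
// 2-D indices are linearized: with an inner dimension of size 1, element (i, j)
// lives at i * 1 + j. A short Python check of the two maps that appear above
// (hypothetical helper name):

def linearize_2x1(i, j):
    # Row-major 2x1 buffer: the stride of dim 0 is 1.
    return i * 1 + j

for i in range(2):
    j = 0
    assert linearize_2x1(i, j) == i + j          # load index, (d0, d1) -> (d0 + d1)
    assert linearize_2x1(1 - i, j) == j - i + 1  # store index, (d0)[s0] -> (d0 - s0 + 1)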
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2}
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
%4 = gpu.thread_id x
%5 = gpu.block_dim x
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
scf.for %arg1 = %2 to %c1 step %3 {
scf.for %arg2 = %4 to %c1 step %5 {
%6 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg0, %arg1)
%7 = memref.load %0[%6] : memref<?xf32, #hal.descriptor_type<storage_buffer>>
%8 = affine.apply affine_map<(d0)[s0] -> (d0 - s0 + 1)>(%arg1)[%arg0]
memref.store %7, %1[%8] : memref<?xf32, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
}
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2}
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
%4 = gpu.thread_id x
%5 = gpu.block_dim x
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
scf.for %arg1 = %2 to %c1 step %3 {
scf.for %arg2 = %4 to %c1 step %5 {
%6 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg0, %arg1)
%7 = memref.load %0[%6] : memref<?xf32, #hal.descriptor_type<storage_buffer>>
%8 = affine.apply affine_map<(d0)[s0] -> (d0 - s0 + 1)>(%arg1)[%arg0]
memref.store %7, %1[%8] : memref<?xf32, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
}
// -----// IR Dump After ConvertAffineToStandard (lower-affine) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2}
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%c64 = arith.constant 64 : index
%2 = arith.muli %workgroup_id_x, %c64 : index
%c64_0 = arith.constant 64 : index
%3 = arith.muli %workgroup_count_x, %c64_0 : index
%4 = gpu.thread_id x
%5 = gpu.block_dim x
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
scf.for %arg1 = %2 to %c1 step %3 {
scf.for %arg2 = %4 to %c1 step %5 {
%6 = arith.addi %arg0, %arg1 : index
%7 = memref.load %0[%6] : memref<?xf32, #hal.descriptor_type<storage_buffer>>
%c-1 = arith.constant -1 : index
%8 = arith.muli %arg0, %c-1 : index
%9 = arith.addi %arg1, %8 : index
%c1_1 = arith.constant 1 : index
%10 = arith.addi %9, %c1_1 : index
memref.store %7, %1[%10] : memref<?xf32, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
}
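// Note: lower-affine expanded the two affine maps into plain arith ops: the
// load index is i + j and the store index is computed as j + i * (-1) + 1. For
// this 2x1 dispatch (j is always 0 and i is 0 or 1) that is simply 1 - i, i.e.
// the two rows swap. A tiny Python check of the expansion and of the
// simplification the following canonicalization performs (subi/addi form):

for i in range(2):
    j = 0
    lowered = j + i * (-1) + 1    # muli %arg0, %c-1; addi; addi %c1
    canonicalized = (j - i) + 1   # subi %arg1, %arg0; addi %c1
    assert lowered == canonicalized == 1 - i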
// -----// IR Dump After TosaToArith (tosa-to-arith) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2}
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%c64 = arith.constant 64 : index
%2 = arith.muli %workgroup_id_x, %c64 : index
%c64_0 = arith.constant 64 : index
%3 = arith.muli %workgroup_count_x, %c64_0 : index
%4 = gpu.thread_id x
%5 = gpu.block_dim x
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
scf.for %arg1 = %2 to %c1 step %3 {
scf.for %arg2 = %4 to %c1 step %5 {
%6 = arith.addi %arg0, %arg1 : index
%7 = memref.load %0[%6] : memref<?xf32, #hal.descriptor_type<storage_buffer>>
%c-1 = arith.constant -1 : index
%8 = arith.muli %arg0, %c-1 : index
%9 = arith.addi %arg1, %8 : index
%c1_1 = arith.constant 1 : index
%10 = arith.addi %9, %c1_1 : index
memref.store %7, %1[%10] : memref<?xf32, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c64 = arith.constant 64 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2}
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = arith.muli %workgroup_id_x, %c64 : index
%3 = arith.muli %workgroup_count_x, %c64 : index
%4 = gpu.thread_id x
%5 = gpu.block_dim x
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
scf.for %arg1 = %2 to %c1 step %3 {
scf.for %arg2 = %4 to %c1 step %5 {
%6 = arith.addi %arg0, %arg1 : index
%7 = memref.load %0[%6] : memref<?xf32, #hal.descriptor_type<storage_buffer>>
%8 = arith.subi %arg1, %arg0 : index
%9 = arith.addi %8, %c1 : index
memref.store %7, %1[%9] : memref<?xf32, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
}
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c64 = arith.constant 64 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2}
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = arith.muli %workgroup_id_x, %c64 : index
%3 = arith.muli %workgroup_count_x, %c64 : index
%4 = gpu.thread_id x
%5 = gpu.block_dim x
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
scf.for %arg1 = %2 to %c1 step %3 {
scf.for %arg2 = %4 to %c1 step %5 {
%6 = arith.addi %arg0, %arg1 : index
%7 = memref.load %0[%6] : memref<?xf32, #hal.descriptor_type<storage_buffer>>
%8 = arith.subi %arg1, %arg0 : index
%9 = arith.addi %8, %c1 : index
memref.store %7, %1[%9] : memref<?xf32, #hal.descriptor_type<storage_buffer>>
}
}
}
return
}
}
// -----// IR Dump After SPIRVMapMemRefStorageClass (iree-spirv-map-memref-storage-class) //----- //
func.func @_reverse_dim0_dispatch_0() {
%c64 = arith.constant 64 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2}
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = arith.muli %workgroup_id_x, %c64 : index
%3 = arith.muli %workgroup_count_x, %c64 : index
%4 = gpu.thread_id x
%5 = gpu.block_dim x
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
scf.for %arg1 = %2 to %c1 step %3 {
scf.for %arg2 = %4 to %c1 step %5 {
%6 = arith.addi %arg0, %arg1 : index
%7 = memref.load %0[%6] : memref<?xf32, #spirv.storage_class<StorageBuffer>>
%8 = arith.subi %arg1, %arg0 : index
%9 = arith.addi %8, %c1 : index
memref.store %7, %1[%9] : memref<?xf32, #spirv.storage_class<StorageBuffer>>
}
}
}
return
}
// -----// IR Dump After SPIRVEmulateI64 (iree-spirv-emulate-i64) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c64 = arith.constant 64 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2}
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = arith.muli %workgroup_id_x, %c64 : index
%3 = arith.muli %workgroup_count_x, %c64 : index
%4 = gpu.thread_id x
%5 = gpu.block_dim x
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
scf.for %arg1 = %2 to %c1 step %3 {
scf.for %arg2 = %4 to %c1 step %5 {
%6 = arith.addi %arg0, %arg1 : index
%7 = memref.load %0[%6] : memref<?xf32, #spirv.storage_class<StorageBuffer>>
%8 = arith.subi %arg1, %arg0 : index
%9 = arith.addi %8, %c1 : index
memref.store %7, %1[%9] : memref<?xf32, #spirv.storage_class<StorageBuffer>>
}
}
}
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c64 = arith.constant 64 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2}
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = arith.muli %workgroup_id_x, %c64 : index
%3 = arith.muli %workgroup_count_x, %c64 : index
%4 = gpu.thread_id x
%5 = gpu.block_dim x
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
scf.for %arg1 = %2 to %c1 step %3 {
scf.for %arg2 = %4 to %c1 step %5 {
%6 = arith.addi %arg0, %arg1 : index
%7 = memref.load %0[%6] : memref<?xf32, #spirv.storage_class<StorageBuffer>>
%8 = arith.subi %arg1, %arg0 : index
%9 = arith.addi %8, %c1 : index
memref.store %7, %1[%9] : memref<?xf32, #spirv.storage_class<StorageBuffer>>
}
}
}
return
}
}
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @_reverse_dim0_dispatch_0() {
%c64 = arith.constant 64 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2}
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = arith.muli %workgroup_id_x, %c64 : index
%3 = arith.muli %workgroup_count_x, %c64 : index
%4 = gpu.thread_id x
%5 = gpu.block_dim x
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y {
scf.for %arg1 = %2 to %c1 step %3 {
scf.for %arg2 = %4 to %c1 step %5 {
%6 = arith.addi %arg0, %arg1 : index
%7 = memref.load %0[%6] : memref<?xf32, #spirv.storage_class<StorageBuffer>>
%8 = arith.subi %arg1, %arg0 : index
%9 = arith.addi %8, %c1 : index
memref.store %7, %1[%9] : memref<?xf32, #spirv.storage_class<StorageBuffer>>
}
}
}
return
}
}
// -----// IR Dump After ConvertToSPIRV (iree-convert-to-spirv) //----- //
module attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>} {
spirv.module Logical GLSL450 {
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spirv.func @_reverse_dim0_dispatch_0() "None" attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [64, 1, 1]>} {
%cst64_i32 = spirv.Constant 64 : i32
%cst2_i32 = spirv.Constant 2 : i32
%cst1_i32 = spirv.Constant 1 : i32
%cst0_i32 = spirv.Constant 0 : i32
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input>
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32>
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input>
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32>
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32>
%__builtin_var_WorkgroupId___addr_0 = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input>
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi32>
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32>
%__builtin_var_NumWorkgroups___addr_1 = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input>
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr_1 : vector<3xi32>
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32>
%8 = spirv.IMul %1, %cst64_i32 : i32
%9 = spirv.IMul %3, %cst64_i32 : i32
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input>
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32>
%cst64_i32_2 = spirv.Constant 64 : i32
spirv.mlir.loop {
spirv.Branch ^bb1(%5 : i32)
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2
%13 = spirv.SLessThan %12, %cst2_i32 : i32
spirv.BranchConditional %13, ^bb2, ^bb3
^bb2: // pred: ^bb1
spirv.mlir.loop {
spirv.Branch ^bb1(%8 : i32)
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2
%16 = spirv.SLessThan %15, %cst1_i32 : i32
spirv.BranchConditional %16, ^bb2, ^bb3
^bb2: // pred: ^bb1
spirv.mlir.loop {
spirv.Branch ^bb1(%11 : i32)
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2
%19 = spirv.SLessThan %18, %cst1_i32 : i32
spirv.BranchConditional %19, ^bb2, ^bb3
^bb2: // pred: ^bb1
%20 = spirv.IAdd %12, %15 : i32
%cst0_i32_3 = spirv.Constant 0 : i32
%cst0_i32_4 = spirv.Constant 0 : i32
%cst1_i32_5 = spirv.Constant 1 : i32
%21 = spirv.IMul %cst1_i32_5, %20 : i32
%22 = spirv.IAdd %cst0_i32_4, %21 : i32
%23 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32_3, %22] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%24 = spirv.Load "StorageBuffer" %23 : f32
%25 = spirv.ISub %15, %12 : i32
%26 = spirv.IAdd %25, %cst1_i32 : i32
%cst0_i32_6 = spirv.Constant 0 : i32
%cst0_i32_7 = spirv.Constant 0 : i32
%cst1_i32_8 = spirv.Constant 1 : i32
%27 = spirv.IMul %cst1_i32_8, %26 : i32
%28 = spirv.IAdd %cst0_i32_7, %27 : i32
%29 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32_6, %28] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
spirv.Store "StorageBuffer" %29, %24 : f32
%30 = spirv.IAdd %18, %cst64_i32_2 : i32
spirv.Branch ^bb1(%30 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
%17 = spirv.IAdd %15, %9 : i32
spirv.Branch ^bb1(%17 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
%14 = spirv.IAdd %12, %7 : i32
spirv.Branch ^bb1(%14 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
spirv.Return
}
}
}
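// Note: after conversion to SPIR-V, each scf.for is a spirv.mlir.loop whose
// header block tests SLessThan and whose back edge adds the step; all index
// arithmetic is now i32, gpu.block_dim x has been folded to the static
// workgroup size (64), and every buffer access is an AccessChain into member 0
// of the runtime-array struct. A Python sketch of that control-flow shape,
// with hypothetical names for the built-in variables:

def reverse_dim0_entry(in_buf, out_buf, workgroup_id, num_workgroups,
                       local_invocation_id):
    i = workgroup_id[1]                      # outer loop over the 2 rows
    while i < 2:                             # SLessThan + BranchConditional
        j = workgroup_id[0] * 64             # middle loop over column tiles
        while j < 1:
            t = local_invocation_id[0]       # inner loop over the 1x1 tile
            while t < 1:
                # AccessChain [0, i + j] + Load on binding 0, then
                # AccessChain [0, j - i + 1] + Store on binding 1.
                out_buf[j - i + 1] = in_buf[i + j]
                t += 64                      # block_dim folded to 64
            j += num_workgroups[0] * 64
        i += num_workgroups[1]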
// -----// IR Dump After SPIRVUnifyAliasedResourcePass (spirv-unify-aliased-resource) //----- //
spirv.module Logical GLSL450 {
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spirv.func @_reverse_dim0_dispatch_0() "None" attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [64, 1, 1]>} {
%cst64_i32 = spirv.Constant 64 : i32
%cst2_i32 = spirv.Constant 2 : i32
%cst1_i32 = spirv.Constant 1 : i32
%cst0_i32 = spirv.Constant 0 : i32
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input>
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32>
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input>
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32>
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32>
%__builtin_var_WorkgroupId___addr_0 = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input>
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi32>
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32>
%__builtin_var_NumWorkgroups___addr_1 = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input>
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr_1 : vector<3xi32>
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32>
%8 = spirv.IMul %1, %cst64_i32 : i32
%9 = spirv.IMul %3, %cst64_i32 : i32
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input>
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32>
%cst64_i32_2 = spirv.Constant 64 : i32
spirv.mlir.loop {
spirv.Branch ^bb1(%5 : i32)
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2
%13 = spirv.SLessThan %12, %cst2_i32 : i32
spirv.BranchConditional %13, ^bb2, ^bb3
^bb2: // pred: ^bb1
spirv.mlir.loop {
spirv.Branch ^bb1(%8 : i32)
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2
%16 = spirv.SLessThan %15, %cst1_i32 : i32
spirv.BranchConditional %16, ^bb2, ^bb3
^bb2: // pred: ^bb1
spirv.mlir.loop {
spirv.Branch ^bb1(%11 : i32)
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2
%19 = spirv.SLessThan %18, %cst1_i32 : i32
spirv.BranchConditional %19, ^bb2, ^bb3
^bb2: // pred: ^bb1
%20 = spirv.IAdd %12, %15 : i32
%cst0_i32_3 = spirv.Constant 0 : i32
%cst0_i32_4 = spirv.Constant 0 : i32
%cst1_i32_5 = spirv.Constant 1 : i32
%21 = spirv.IMul %cst1_i32_5, %20 : i32
%22 = spirv.IAdd %cst0_i32_4, %21 : i32
%23 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32_3, %22] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%24 = spirv.Load "StorageBuffer" %23 : f32
%25 = spirv.ISub %15, %12 : i32
%26 = spirv.IAdd %25, %cst1_i32 : i32
%cst0_i32_6 = spirv.Constant 0 : i32
%cst0_i32_7 = spirv.Constant 0 : i32
%cst1_i32_8 = spirv.Constant 1 : i32
%27 = spirv.IMul %cst1_i32_8, %26 : i32
%28 = spirv.IAdd %cst0_i32_7, %27 : i32
%29 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32_6, %28] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
spirv.Store "StorageBuffer" %29, %24 : f32
%30 = spirv.IAdd %18, %cst64_i32_2 : i32
spirv.Branch ^bb1(%30 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
%17 = spirv.IAdd %15, %9 : i32
spirv.Branch ^bb1(%17 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
%14 = spirv.IAdd %12, %7 : i32
spirv.Branch ^bb1(%14 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
spirv.Return
}
}
// -----// IR Dump After SPIRVLowerABIAttributesPass (spirv-lower-abi-attrs) //----- //
spirv.module Logical GLSL450 {
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spirv.func @_reverse_dim0_dispatch_0() "None" {
%cst64_i32 = spirv.Constant 64 : i32
%cst2_i32 = spirv.Constant 2 : i32
%cst1_i32 = spirv.Constant 1 : i32
%cst0_i32 = spirv.Constant 0 : i32
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input>
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32>
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input>
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32>
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32>
%__builtin_var_WorkgroupId___addr_0 = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input>
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi32>
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32>
%__builtin_var_NumWorkgroups___addr_1 = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input>
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr_1 : vector<3xi32>
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32>
%8 = spirv.IMul %1, %cst64_i32 : i32
%9 = spirv.IMul %3, %cst64_i32 : i32
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input>
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32>
%cst64_i32_2 = spirv.Constant 64 : i32
spirv.mlir.loop {
spirv.Branch ^bb1(%5 : i32)
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2
%13 = spirv.SLessThan %12, %cst2_i32 : i32
spirv.BranchConditional %13, ^bb2, ^bb3
^bb2: // pred: ^bb1
spirv.mlir.loop {
spirv.Branch ^bb1(%8 : i32)
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2
%16 = spirv.SLessThan %15, %cst1_i32 : i32
spirv.BranchConditional %16, ^bb2, ^bb3
^bb2: // pred: ^bb1
spirv.mlir.loop {
spirv.Branch ^bb1(%11 : i32)
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2
%19 = spirv.SLessThan %18, %cst1_i32 : i32
spirv.BranchConditional %19, ^bb2, ^bb3
^bb2: // pred: ^bb1
%20 = spirv.IAdd %12, %15 : i32
%cst0_i32_3 = spirv.Constant 0 : i32
%cst0_i32_4 = spirv.Constant 0 : i32
%cst1_i32_5 = spirv.Constant 1 : i32
%21 = spirv.IMul %cst1_i32_5, %20 : i32
%22 = spirv.IAdd %cst0_i32_4, %21 : i32
%23 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32_3, %22] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%24 = spirv.Load "StorageBuffer" %23 : f32
%25 = spirv.ISub %15, %12 : i32
%26 = spirv.IAdd %25, %cst1_i32 : i32
%cst0_i32_6 = spirv.Constant 0 : i32
%cst0_i32_7 = spirv.Constant 0 : i32
%cst1_i32_8 = spirv.Constant 1 : i32
%27 = spirv.IMul %cst1_i32_8, %26 : i32
%28 = spirv.IAdd %cst0_i32_7, %27 : i32
%29 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32_6, %28] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
spirv.Store "StorageBuffer" %29, %24 : f32
%30 = spirv.IAdd %18, %cst64_i32_2 : i32
spirv.Branch ^bb1(%30 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
%17 = spirv.IAdd %15, %9 : i32
spirv.Branch ^bb1(%17 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
%14 = spirv.IAdd %12, %7 : i32
spirv.Branch ^bb1(%14 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
spirv.Return
}
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
spirv.module Logical GLSL450 {
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spirv.func @_reverse_dim0_dispatch_0() "None" {
%cst64_i32 = spirv.Constant 64 : i32
%cst2_i32 = spirv.Constant 2 : i32
%cst1_i32 = spirv.Constant 1 : i32
%cst0_i32 = spirv.Constant 0 : i32
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input>
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32>
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input>
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32>
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32>
%__builtin_var_WorkgroupId___addr_0 = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input>
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi32>
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32>
%__builtin_var_NumWorkgroups___addr_1 = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input>
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr_1 : vector<3xi32>
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32>
%8 = spirv.IMul %1, %cst64_i32 : i32
%9 = spirv.IMul %3, %cst64_i32 : i32
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input>
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32>
spirv.mlir.loop {
spirv.Branch ^bb1(%5 : i32)
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2
%13 = spirv.SLessThan %12, %cst2_i32 : i32
spirv.BranchConditional %13, ^bb2, ^bb3
^bb2: // pred: ^bb1
spirv.mlir.loop {
spirv.Branch ^bb1(%8 : i32)
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2
%16 = spirv.SLessThan %15, %cst1_i32 : i32
spirv.BranchConditional %16, ^bb2, ^bb3
^bb2: // pred: ^bb1
spirv.mlir.loop {
spirv.Branch ^bb1(%11 : i32)
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2
%19 = spirv.SLessThan %18, %cst1_i32 : i32
spirv.BranchConditional %19, ^bb2, ^bb3
^bb2: // pred: ^bb1
%20 = spirv.IAdd %12, %15 : i32
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%22 = spirv.Load "StorageBuffer" %21 : f32
%23 = spirv.ISub %15, %12 : i32
%24 = spirv.IAdd %23, %cst1_i32 : i32
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
spirv.Store "StorageBuffer" %25, %22 : f32
%26 = spirv.IAdd %18, %cst64_i32 : i32
spirv.Branch ^bb1(%26 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
%17 = spirv.IAdd %15, %9 : i32
spirv.Branch ^bb1(%17 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
%14 = spirv.IAdd %12, %7 : i32
spirv.Branch ^bb1(%14 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
spirv.Return
}
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1
}
// -----// IR Dump After CSE (cse) //----- //
spirv.module Logical GLSL450 {
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spirv.func @_reverse_dim0_dispatch_0() "None" {
%cst64_i32 = spirv.Constant 64 : i32
%cst2_i32 = spirv.Constant 2 : i32
%cst1_i32 = spirv.Constant 1 : i32
%cst0_i32 = spirv.Constant 0 : i32
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input>
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32>
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input>
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32>
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32>
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32>
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32>
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32>
%8 = spirv.IMul %1, %cst64_i32 : i32
%9 = spirv.IMul %3, %cst64_i32 : i32
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input>
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32>
spirv.mlir.loop {
spirv.Branch ^bb1(%5 : i32)
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2
%13 = spirv.SLessThan %12, %cst2_i32 : i32
spirv.BranchConditional %13, ^bb2, ^bb3
^bb2: // pred: ^bb1
spirv.mlir.loop {
spirv.Branch ^bb1(%8 : i32)
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2
%16 = spirv.SLessThan %15, %cst1_i32 : i32
spirv.BranchConditional %16, ^bb2, ^bb3
^bb2: // pred: ^bb1
spirv.mlir.loop {
spirv.Branch ^bb1(%11 : i32)
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2
%19 = spirv.SLessThan %18, %cst1_i32 : i32
spirv.BranchConditional %19, ^bb2, ^bb3
^bb2: // pred: ^bb1
%20 = spirv.IAdd %12, %15 : i32
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%22 = spirv.Load "StorageBuffer" %21 : f32
%23 = spirv.ISub %15, %12 : i32
%24 = spirv.IAdd %23, %cst1_i32 : i32
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
spirv.Store "StorageBuffer" %25, %22 : f32
%26 = spirv.IAdd %18, %cst64_i32 : i32
spirv.Branch ^bb1(%26 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
%17 = spirv.IAdd %15, %9 : i32
spirv.Branch ^bb1(%17 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
%14 = spirv.IAdd %12, %7 : i32
spirv.Branch ^bb1(%14 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
spirv.Return
}
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1
}
// -----// IR Dump After SPIRVRewriteInsertsPass (spirv-rewrite-inserts) //----- //
spirv.module Logical GLSL450 {
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spirv.func @_reverse_dim0_dispatch_0() "None" {
%cst64_i32 = spirv.Constant 64 : i32
%cst2_i32 = spirv.Constant 2 : i32
%cst1_i32 = spirv.Constant 1 : i32
%cst0_i32 = spirv.Constant 0 : i32
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input>
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32>
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input>
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32>
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32>
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32>
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32>
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32>
%8 = spirv.IMul %1, %cst64_i32 : i32
%9 = spirv.IMul %3, %cst64_i32 : i32
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input>
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32>
spirv.mlir.loop {
spirv.Branch ^bb1(%5 : i32)
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2
%13 = spirv.SLessThan %12, %cst2_i32 : i32
spirv.BranchConditional %13, ^bb2, ^bb3
^bb2: // pred: ^bb1
spirv.mlir.loop {
spirv.Branch ^bb1(%8 : i32)
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2
%16 = spirv.SLessThan %15, %cst1_i32 : i32
spirv.BranchConditional %16, ^bb2, ^bb3
^bb2: // pred: ^bb1
spirv.mlir.loop {
spirv.Branch ^bb1(%11 : i32)
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2
%19 = spirv.SLessThan %18, %cst1_i32 : i32
spirv.BranchConditional %19, ^bb2, ^bb3
^bb2: // pred: ^bb1
%20 = spirv.IAdd %12, %15 : i32
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%22 = spirv.Load "StorageBuffer" %21 : f32
%23 = spirv.ISub %15, %12 : i32
%24 = spirv.IAdd %23, %cst1_i32 : i32
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
spirv.Store "StorageBuffer" %25, %22 : f32
%26 = spirv.IAdd %18, %cst64_i32 : i32
spirv.Branch ^bb1(%26 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
%17 = spirv.IAdd %15, %9 : i32
spirv.Branch ^bb1(%17 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
%14 = spirv.IAdd %12, %7 : i32
spirv.Branch ^bb1(%14 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
spirv.Return
}
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1
}
// -----// IR Dump After SPIRVCanonicalizeGLPass (spirv-canonicalize-gl) //----- //
spirv.module Logical GLSL450 {
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spirv.func @_reverse_dim0_dispatch_0() "None" {
%cst64_i32 = spirv.Constant 64 : i32
%cst2_i32 = spirv.Constant 2 : i32
%cst1_i32 = spirv.Constant 1 : i32
%cst0_i32 = spirv.Constant 0 : i32
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input>
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32>
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input>
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32>
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32>
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32>
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32>
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32>
%8 = spirv.IMul %1, %cst64_i32 : i32
%9 = spirv.IMul %3, %cst64_i32 : i32
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input>
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32>
spirv.mlir.loop {
spirv.Branch ^bb1(%5 : i32)
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2
%13 = spirv.SLessThan %12, %cst2_i32 : i32
spirv.BranchConditional %13, ^bb2, ^bb3
^bb2: // pred: ^bb1
spirv.mlir.loop {
spirv.Branch ^bb1(%8 : i32)
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2
%16 = spirv.SLessThan %15, %cst1_i32 : i32
spirv.BranchConditional %16, ^bb2, ^bb3
^bb2: // pred: ^bb1
spirv.mlir.loop {
spirv.Branch ^bb1(%11 : i32)
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2
%19 = spirv.SLessThan %18, %cst1_i32 : i32
spirv.BranchConditional %19, ^bb2, ^bb3
^bb2: // pred: ^bb1
%20 = spirv.IAdd %12, %15 : i32
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%22 = spirv.Load "StorageBuffer" %21 : f32
%23 = spirv.ISub %15, %12 : i32
%24 = spirv.IAdd %23, %cst1_i32 : i32
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
spirv.Store "StorageBuffer" %25, %22 : f32
%26 = spirv.IAdd %18, %cst64_i32 : i32
spirv.Branch ^bb1(%26 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
%17 = spirv.IAdd %15, %9 : i32
spirv.Branch ^bb1(%17 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
%14 = spirv.IAdd %12, %7 : i32
spirv.Branch ^bb1(%14 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
spirv.Return
}
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1
}
// -----// IR Dump After SPIRVUpdateVCEPass (spirv-update-vce) //----- //
spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spirv.func @_reverse_dim0_dispatch_0() "None" {
%cst64_i32 = spirv.Constant 64 : i32
%cst2_i32 = spirv.Constant 2 : i32
%cst1_i32 = spirv.Constant 1 : i32
%cst0_i32 = spirv.Constant 0 : i32
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input>
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32>
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input>
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32>
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32>
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32>
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32>
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32>
%8 = spirv.IMul %1, %cst64_i32 : i32
%9 = spirv.IMul %3, %cst64_i32 : i32
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input>
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32>
spirv.mlir.loop {
spirv.Branch ^bb1(%5 : i32)
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2
%13 = spirv.SLessThan %12, %cst2_i32 : i32
spirv.BranchConditional %13, ^bb2, ^bb3
^bb2: // pred: ^bb1
spirv.mlir.loop {
spirv.Branch ^bb1(%8 : i32)
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2
%16 = spirv.SLessThan %15, %cst1_i32 : i32
spirv.BranchConditional %16, ^bb2, ^bb3
^bb2: // pred: ^bb1
spirv.mlir.loop {
spirv.Branch ^bb1(%11 : i32)
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2
%19 = spirv.SLessThan %18, %cst1_i32 : i32
spirv.BranchConditional %19, ^bb2, ^bb3
^bb2: // pred: ^bb1
%20 = spirv.IAdd %12, %15 : i32
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%22 = spirv.Load "StorageBuffer" %21 : f32
%23 = spirv.ISub %15, %12 : i32
%24 = spirv.IAdd %23, %cst1_i32 : i32
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
spirv.Store "StorageBuffer" %25, %22 : f32
%26 = spirv.IAdd %18, %cst64_i32 : i32
spirv.Branch ^bb1(%26 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
%17 = spirv.IAdd %15, %9 : i32
spirv.Branch ^bb1(%17 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
%14 = spirv.IAdd %12, %7 : i32
spirv.Branch ^bb1(%14 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
spirv.Return
}
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateTargetExecutableVariantsPass (iree-hal-translate-target-executable-variants) //----- //
hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>}> {
hal.executable.export public @_reverse_dim0_dispatch_0 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [64 : index, 1 : index, 1 : index]} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg2]
hal.return %0, %arg1, %c1 : index, index, index
}
builtin.module attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>} {
spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spirv.func @_reverse_dim0_dispatch_0() "None" {
%cst64_i32 = spirv.Constant 64 : i32
%cst2_i32 = spirv.Constant 2 : i32
%cst1_i32 = spirv.Constant 1 : i32
%cst0_i32 = spirv.Constant 0 : i32
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input>
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32>
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input>
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32>
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32>
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32>
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32>
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32>
%8 = spirv.IMul %1, %cst64_i32 : i32
%9 = spirv.IMul %3, %cst64_i32 : i32
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input>
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32>
spirv.mlir.loop {
spirv.Branch ^bb1(%5 : i32)
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2
%13 = spirv.SLessThan %12, %cst2_i32 : i32
spirv.BranchConditional %13, ^bb2, ^bb3
^bb2: // pred: ^bb1
spirv.mlir.loop {
spirv.Branch ^bb1(%8 : i32)
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2
%16 = spirv.SLessThan %15, %cst1_i32 : i32
spirv.BranchConditional %16, ^bb2, ^bb3
^bb2: // pred: ^bb1
spirv.mlir.loop {
spirv.Branch ^bb1(%11 : i32)
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2
%19 = spirv.SLessThan %18, %cst1_i32 : i32
spirv.BranchConditional %19, ^bb2, ^bb3
^bb2: // pred: ^bb1
%20 = spirv.IAdd %12, %15 : i32
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%22 = spirv.Load "StorageBuffer" %21 : f32
%23 = spirv.ISub %15, %12 : i32
%24 = spirv.IAdd %23, %cst1_i32 : i32
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
spirv.Store "StorageBuffer" %25, %22 : f32
%26 = spirv.IAdd %18, %cst64_i32 : i32
spirv.Branch ^bb1(%26 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
%17 = spirv.IAdd %15, %9 : i32
spirv.Branch ^bb1(%17 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
%14 = spirv.IAdd %12, %7 : i32
spirv.Branch ^bb1(%14 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
spirv.Return
}
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1
}
}
}
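
For reference, the hal.executable.export region in the dump above is what decides the dispatch grid: it takes the workload operands handed in by the host (%arg1, %arg2) and applies affine_map<()[s0] -> (s0 ceildiv 64)> to the second one, passing the first through and pinning the third dimension to 1. A minimal host-side sketch of that computation in C, with hypothetical workload_x/workload_y parameters standing in for %arg2/%arg1 (names are illustrative, not part of the dump):

#include <stdint.h>

/* Hypothetical stand-ins for the workload operands the HAL runtime feeds
 * into the export region; only the arithmetic mirrors the IR above. */
typedef struct { uint32_t x, y, z; } grid_t;

static grid_t compute_dispatch_grid(uint32_t workload_x, uint32_t workload_y) {
  grid_t grid;
  /* affine_map<()[s0] -> (s0 ceildiv 64)> applied to %arg2. */
  grid.x = (workload_x + 64 - 1) / 64;
  /* %arg1 is returned unchanged as the second grid dimension. */
  grid.y = workload_y;
  /* The export region returns the constant %c1 for the third dimension. */
  grid.z = 1;
  return grid;
}
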
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass (iree-hal-translate-executables) //----- //
hal.executable public @_reverse_dim0_dispatch_0 {
hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>}> {
hal.executable.export public @_reverse_dim0_dispatch_0 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [64 : index, 1 : index, 1 : index]} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg2]
hal.return %0, %arg1, %c1 : index, index, index
}
builtin.module attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>} {
spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spirv.func @_reverse_dim0_dispatch_0() "None" {
%cst64_i32 = spirv.Constant 64 : i32
%cst2_i32 = spirv.Constant 2 : i32
%cst1_i32 = spirv.Constant 1 : i32
%cst0_i32 = spirv.Constant 0 : i32
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input>
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32>
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input>
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32>
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32>
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32>
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32>
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32>
%8 = spirv.IMul %1, %cst64_i32 : i32
%9 = spirv.IMul %3, %cst64_i32 : i32
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input>
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32>
spirv.mlir.loop {
spirv.Branch ^bb1(%5 : i32)
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2
%13 = spirv.SLessThan %12, %cst2_i32 : i32
spirv.BranchConditional %13, ^bb2, ^bb3
^bb2: // pred: ^bb1
spirv.mlir.loop {
spirv.Branch ^bb1(%8 : i32)
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2
%16 = spirv.SLessThan %15, %cst1_i32 : i32
spirv.BranchConditional %16, ^bb2, ^bb3
^bb2: // pred: ^bb1
spirv.mlir.loop {
spirv.Branch ^bb1(%11 : i32)
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2
%19 = spirv.SLessThan %18, %cst1_i32 : i32
spirv.BranchConditional %19, ^bb2, ^bb3
^bb2: // pred: ^bb1
%20 = spirv.IAdd %12, %15 : i32
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%22 = spirv.Load "StorageBuffer" %21 : f32
%23 = spirv.ISub %15, %12 : i32
%24 = spirv.IAdd %23, %cst1_i32 : i32
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
spirv.Store "StorageBuffer" %25, %22 : f32
%26 = spirv.IAdd %18, %cst64_i32 : i32
spirv.Branch ^bb1(%26 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
%17 = spirv.IAdd %15, %9 : i32
spirv.Branch ^bb1(%17 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
%14 = spirv.IAdd %12, %7 : i32
spirv.Branch ^bb1(%14 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
spirv.Return
}
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1
}
}
}
}
#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>}>
#map = affine_map<()[s0] -> (s0 ceildiv 64)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<SPIRVBaseDistribute>
module {
hal.executable public @_reverse_dim0_dispatch_0 {
hal.executable.variant public @vulkan_spirv_fb, target = #executable_target_vulkan_spirv_fb {
hal.executable.export public @_reverse_dim0_dispatch_0 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation, workgroup_size = [64 : index, 1 : index, 1 : index]} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map()[%arg2]
hal.return %0, %arg1, %c1 : index, index, index
}
builtin.module attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>} {
spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input>
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spirv.func @_reverse_dim0_dispatch_0() "None" {
%cst64_i32 = spirv.Constant 64 : i32
%cst2_i32 = spirv.Constant 2 : i32
%cst1_i32 = spirv.Constant 1 : i32
%cst0_i32 = spirv.Constant 0 : i32
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input>
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32>
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input>
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32>
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32>
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32>
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32>
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32>
%8 = spirv.IMul %1, %cst64_i32 : i32
%9 = spirv.IMul %3, %cst64_i32 : i32
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input>
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32>
spirv.mlir.loop {
spirv.Branch ^bb1(%5 : i32)
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2
%13 = spirv.SLessThan %12, %cst2_i32 : i32
spirv.BranchConditional %13, ^bb2, ^bb3
^bb2: // pred: ^bb1
spirv.mlir.loop {
spirv.Branch ^bb1(%8 : i32)
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2
%16 = spirv.SLessThan %15, %cst1_i32 : i32
spirv.BranchConditional %16, ^bb2, ^bb3
^bb2: // pred: ^bb1
spirv.mlir.loop {
spirv.Branch ^bb1(%11 : i32)
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2
%19 = spirv.SLessThan %18, %cst1_i32 : i32
spirv.BranchConditional %19, ^bb2, ^bb3
^bb2: // pred: ^bb1
%20 = spirv.IAdd %12, %15 : i32
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%22 = spirv.Load "StorageBuffer" %21 : f32
%23 = spirv.ISub %15, %12 : i32
%24 = spirv.IAdd %23, %cst1_i32 : i32
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
spirv.Store "StorageBuffer" %25, %22 : f32
%26 = spirv.IAdd %18, %cst64_i32 : i32
spirv.Branch ^bb1(%26 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
%17 = spirv.IAdd %15, %9 : i32
spirv.Branch ^bb1(%17 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
%14 = spirv.IAdd %12, %7 : i32
spirv.Branch ^bb1(%14 : i32)
^bb3: // pred: ^bb1
spirv.mlir.merge
}
spirv.Return
}
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1
}
}
}
}
}
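
The control flow of the final spirv.func above amounts to a triply nested, workgroup-distributed loop that copies each element of the 2x1 input to the mirrored row of the output. A rough C sketch of the same logic, assuming hypothetical workgroup_id, num_workgroups, and local_invocation_id arguments standing in for the SPIR-V builtins and plain float arrays standing in for the two storage buffers (all names illustrative):

#include <stdint.h>

/* Hypothetical stand-ins for the SPIR-V Input builtins and the two bound
 * storage buffers; the arithmetic mirrors the loop nest in the dump above. */
typedef struct { int32_t x, y, z; } uvec3_t;

static void reverse_dim0_dispatch_0(uvec3_t workgroup_id, uvec3_t num_workgroups,
                                    uvec3_t local_invocation_id,
                                    const float *input, float *output) {
  /* Outer loop over dim 0 (2 rows), distributed across workgroup id y. */
  for (int32_t i = workgroup_id.y; i < 2; i += num_workgroups.y) {
    /* Middle loop over dim 1 (1 column), distributed across workgroup id x
     * in tiles of 64. */
    for (int32_t j = workgroup_id.x * 64; j < 1; j += num_workgroups.x * 64) {
      /* Inner loop only gates which local invocations run (k is not used in
       * the address computation), matching the unused %18 in the IR. */
      for (int32_t k = local_invocation_id.x; k < 1; k += 64) {
        /* Load element at linear index i + j and store it to the reversed
         * row, linear index (j - i) + 1. */
        float v = input[i + j];
        output[(j - i) + 1] = v;
      }
    }
  }
}
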