// -----// IR Dump After TypePropagation (iree-codegen-type-propagation) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<2x1xf32> | |
%3 = tensor.empty() : tensor<2x1xf32> | |
%4 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%2 : tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) : tensor<2x1xf32> | |
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
return | |
} | |
// -----// IR Dump After BufferizeCopyOnlyDispatches (iree-codegen-bufferize-copy-only-dispatches) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<2x1xf32> | |
%3 = tensor.empty() : tensor<2x1xf32> | |
%4 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%2 : tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) : tensor<2x1xf32> | |
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
return | |
} | |
} | |
// -----// IR Dump After DecomposeSoftmax (iree-linalg-ext-decompose-softmax) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<2x1xf32> | |
%3 = tensor.empty() : tensor<2x1xf32> | |
%4 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%2 : tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) : tensor<2x1xf32> | |
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
return | |
} | |
// -----// IR Dump After RematerializeParallelOps (iree-codegen-rematerialize-parallel-ops) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<2x1xf32> | |
%3 = tensor.empty() : tensor<2x1xf32> | |
%4 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%2 : tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) : tensor<2x1xf32> | |
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
return | |
} | |
// -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) //----- // | |
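// Note: the four dumps above are unchanged; this is the first pass in the log that rewrites the dispatch. The dump below shows the surrounding hal.executable.variant/export (SPIRVBaseDistribute pipeline, workgroup_size = [64, 1, 1]), the reverse tiled with tile_sizes [[1, 64], [1, 1]] and distributed over workgroup ids, and the store offset mirrored along dim 0 via affine_map<()[s0] -> (-s0 + 1)>.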
hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>}> { | |
hal.executable.export public @_reverse_dim0_dispatch_0 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [64 : index, 1 : index, 1 : index]} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg2] | |
hal.return %0, %arg1, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1], sizes = [1, %c1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<1x?xf32> | |
%5 = tensor.empty() : tensor<1x1xf32> | |
%6 = iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%4 : tensor<1x?xf32>) outs(%5 : tensor<1x1xf32>) : tensor<1x1xf32> | |
%cast = tensor.cast %6 : tensor<1x1xf32> to tensor<1x?xf32> | |
%7 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
flow.dispatch.tensor.store %cast, %1, offsets = [%7, %arg1], sizes = [1, %c1], strides = [1, 1] : tensor<1x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
} | |
} | |
return | |
} | |
} | |
} | |
// -----// IR Dump After ConvertToDestinationPassingStyle (iree-codegen-convert-to-destination-passing-style) //----- // | |
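// Note: the tensor.empty destination is gone; the reverse now takes as its outs operand a slice loaded from the writeonly output binding at the mirrored offset, so it computes directly into its final destination.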
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%5 = flow.dispatch.tensor.load %1, offsets = [%4, %arg1], sizes = [1, %c1], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> -> tensor<1x?xf32> | |
%cast = tensor.cast %5 : tensor<1x?xf32> to tensor<1x1xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1], sizes = [1, %c1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<1x?xf32> | |
%7 = iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%6 : tensor<1x?xf32>) outs(%cast : tensor<1x1xf32>) : tensor<1x1xf32> | |
%cast_0 = tensor.cast %7 : tensor<1x1xf32> to tensor<1x?xf32> | |
%8 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
flow.dispatch.tensor.store %cast_0, %1, offsets = [%8, %arg1], sizes = [1, %c1], strides = [1, 1] : tensor<1x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
} | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%5 = flow.dispatch.tensor.load %1, offsets = [%4, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> -> tensor<1x1xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<1x1xf32> | |
%7 = iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%6 : tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) : tensor<1x1xf32> | |
%8 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
flow.dispatch.tensor.store %7, %1, offsets = [%8, %arg1], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%5 = flow.dispatch.tensor.load %1, offsets = [%4, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> -> tensor<1x1xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<1x1xf32> | |
%7 = iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%6 : tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) : tensor<1x1xf32> | |
flow.dispatch.tensor.store %7, %1, offsets = [%4, %arg1], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After EliminateEmptyTensors (iree-eliminate-empty-tensors) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%5 = flow.dispatch.tensor.load %1, offsets = [%4, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> -> tensor<1x1xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<1x1xf32> | |
%7 = iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%6 : tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) : tensor<1x1xf32> | |
flow.dispatch.tensor.store %7, %1, offsets = [%4, %arg1], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%5 = flow.dispatch.tensor.load %1, offsets = [%4, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> -> tensor<1x1xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<1x1xf32> | |
%7 = iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%6 : tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) : tensor<1x1xf32> | |
flow.dispatch.tensor.store %7, %1, offsets = [%4, %arg1], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After IREEComprehensiveBufferize (iree-codegen-iree-comprehensive-bufferize) //----- // | |
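// Note: the dispatch is bufferized here: the bindings become 64-byte-aligned storage-buffer memrefs, the tensor loads/stores become memref.subview ops, and the trailing store turns into a memref.copy between two subviews of the same output slice (cleaned up by the CSE/canonicalization runs below).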
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %4 to %c1 step %5 { | |
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
%subview_1 = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview, %subview_1 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %4 to %c1 step %5 { | |
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
%subview_1 = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview, %subview_1 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %4 to %c1 step %5 { | |
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
%subview_1 = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview, %subview_1 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
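// Note: CSE folds the two identical output subviews, so the copy becomes a self-copy (memref.copy %subview, %subview); the canonicalizer run that follows drops it entirely.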
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %4 to %c1 step %5 { | |
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
memref.copy %subview, %subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %4 to %c1 step %5 { | |
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
} | |
} | |
return | |
} | |
// -----// IR Dump After CleanupBufferAllocView (iree-codegen-cleanup-buffer-alloc-view) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
} | |
} | |
return | |
} | |
// -----// IR Dump After SPIRVTileAndDistribute (iree-spirv-tile-and-distribute) //----- // | |
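// Note: the per-workgroup tile is further distributed across invocations with gpu.thread_id / gpu.block_dim loops bounded by %c1, and the reverse is tagged __internal_linalg_transform__ = "tile_reduction" on dynamically shaped casts of the subviews.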
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
%7 = gpu.thread_id y | |
%8 = gpu.block_dim y | |
scf.for %arg2 = %7 to %c1 step %8 { | |
scf.for %arg3 = %5 to %c1 step %6 { | |
%cast = memref.cast %subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%cast_1 = memref.cast %subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.reverse {__internal_linalg_transform__ = "tile_reduction", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%cast : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%cast_1 : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
} | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After MemrefCopyToLinalgPass (iree-codegen-memrefcopy-to-linalg) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
%7 = gpu.thread_id y | |
%8 = gpu.block_dim y | |
scf.for %arg2 = %7 to %c1 step %8 { | |
scf.for %arg3 = %5 to %c1 step %6 { | |
%cast = memref.cast %subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%cast_1 = memref.cast %subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.reverse {__internal_linalg_transform__ = "tile_reduction", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%cast : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%cast_1 : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
} | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After GPUDistributeSharedMemoryCopy (iree-gpu-distribute-shared-memory-copy) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
%7 = gpu.thread_id y | |
%8 = gpu.block_dim y | |
scf.for %arg2 = %7 to %c1 step %8 { | |
scf.for %arg3 = %5 to %c1 step %6 { | |
%cast = memref.cast %subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%cast_1 = memref.cast %subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.reverse {__internal_linalg_transform__ = "tile_reduction", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%cast : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%cast_1 : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
} | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
%7 = gpu.thread_id y | |
%8 = gpu.block_dim y | |
scf.for %arg2 = %7 to %c1 step %8 { | |
scf.for %arg3 = %5 to %c1 step %6 { | |
%cast = memref.cast %subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%cast_1 = memref.cast %subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.reverse {__internal_linalg_transform__ = "tile_reduction", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%cast : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%cast_1 : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
} | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
%7 = gpu.thread_id y | |
%8 = gpu.block_dim y | |
scf.for %arg2 = %7 to %c1 step %8 { | |
scf.for %arg3 = %5 to %c1 step %6 { | |
%cast = memref.cast %subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%cast_1 = memref.cast %subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.reverse {__internal_linalg_transform__ = "tile_reduction", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%cast : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%cast_1 : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
} | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After LinalgExtToLoops (iree-linalg-ext-to-loops) //----- // | |
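// Note: iree_linalg_ext.reverse is lowered to explicit loops: each element is loaded from the input subview and stored at the mirrored dim-0 index (%c0 - %arg4; the tile is 1x1, so this is always 0) in the output subview.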
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
%7 = gpu.thread_id y | |
%8 = gpu.block_dim y | |
scf.for %arg2 = %7 to %c1 step %8 { | |
scf.for %arg3 = %5 to %c1 step %6 { | |
scf.for %arg4 = %c0 to %c1 step %c1 { | |
scf.for %arg5 = %c0 to %c1 step %c1 { | |
%9 = arith.subi %c0, %arg4 : index | |
%10 = memref.load %subview_0[%arg4, %arg5] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.store %10, %subview[%9, %arg5] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After MemrefCopyToLinalgPass (iree-codegen-memrefcopy-to-linalg) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
%7 = gpu.thread_id y | |
%8 = gpu.block_dim y | |
scf.for %arg2 = %7 to %c1 step %8 { | |
scf.for %arg3 = %5 to %c1 step %6 { | |
scf.for %arg4 = %c0 to %c1 step %c1 { | |
scf.for %arg5 = %c0 to %c1 step %c1 { | |
%9 = arith.subi %c0, %arg4 : index | |
%10 = memref.load %subview_0[%arg4, %arg5] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.store %10, %subview[%9, %arg5] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After LinalgLowerToLoops (convert-linalg-to-loops) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
%7 = gpu.thread_id y | |
%8 = gpu.block_dim y | |
scf.for %arg2 = %7 to %c1 step %8 { | |
scf.for %arg3 = %5 to %c1 step %6 { | |
scf.for %arg4 = %c0 to %c1 step %c1 { | |
scf.for %arg5 = %c0 to %c1 step %c1 { | |
%9 = arith.subi %c0, %arg4 : index | |
%10 = memref.load %subview_0[%arg4, %arg5] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.store %10, %subview[%9, %arg5] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After RemoveSingleIterationLoop (iree-codegen-remove-single-iteration-loop) //----- // | |
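// Note: the unit-trip loops (the thread-id y loop and the two scalar loops) are removed, leaving a single load/store at [%c0, %c0] inside the remaining thread-id x loop.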
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
scf.for %arg2 = %5 to %c1 step %6 { | |
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
scf.for %arg2 = %5 to %c1 step %6 { | |
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
scf.for %arg2 = %5 to %c1 step %6 { | |
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After SPIRVLowerExecutableTarget (iree-spirv-lower-executable-target-pass) //----- // | |
hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>}> { | |
hal.executable.export public @_reverse_dim0_dispatch_0 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [64 : index, 1 : index, 1 : index]} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg2] | |
hal.return %0, %arg1, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
scf.for %arg2 = %5 to %c1 step %6 { | |
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
scf.for %arg2 = %5 to %c1 step %6 { | |
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
scf.for %arg2 = %5 to %c1 step %6 { | |
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After PolynomialApproximationPass (iree-codegen-polynomial-approximation) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
scf.for %arg2 = %5 to %c1 step %6 { | |
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After PadDynamicAlloc (iree-codegen-pad-dynamic-alloc) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
scf.for %arg2 = %5 to %c1 step %6 { | |
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
scf.for %arg2 = %5 to %c1 step %6 { | |
%7 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg0] | |
%8 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1] | |
%9 = memref.load %0[%7, %8] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%10 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%4] | |
%11 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1] | |
memref.store %9, %1[%10, %11] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
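// Note (annotation, not compiler output): fold-memref-alias-ops removes the 1x1 memref.subview ops and
// folds their offsets straight into the load/store indices on the 2x1 base buffers, via the
// (d0)[s0] -> (d0 + s0) maps above. A minimal standalone sketch of that folded index computation follows;
// the function and operand names are hypothetical, not taken from the dump.
func.func @folded_index_sketch(%subview_offset: index) -> index {
  %c0 = arith.constant 0 : index
  // With the in-tile index d0 = 0, (d0 + s0) reduces to the subview offset itself.
  %idx = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%subview_offset]
  return %idx : index
}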
// -----// IR Dump After ExpandOps (memref-expand) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
scf.for %arg2 = %5 to %c1 step %6 { | |
%7 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg0] | |
%8 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1] | |
%9 = memref.load %0[%7, %8] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%10 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%4] | |
%11 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1] | |
memref.store %9, %1[%10, %11] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%7 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
memref.store %6, %1[%7, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%7 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
memref.store %6, %1[%7, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
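// Note (annotation, not compiler output): after canonicalization and CSE the reverse along dim 0 is fully
// explicit in the index arithmetic: the value loaded from row %arg0 is stored to row (-%arg0 + 1), so rows
// 0 and 1 swap for this size-2 dimension. A tiny standalone sketch of just that mapping, with hypothetical
// names:
func.func @reversed_row_sketch(%row: index) -> index {
  // For a dimension of size 2: reversed index = size - 1 - row = -row + 1.
  %rev = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%row]
  return %rev : index
}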
// -----// IR Dump After SPIRVVectorizeLoadStore (iree-spirv-vectorize-load-store) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%7 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
memref.store %6, %1[%7, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After OptimizeVectorTransfer (iree-codegen-optimize-vector-transfer) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%7 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %1[%6, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After SPIRVBreakDownLargeVector (iree-spirv-breakdown-large-vector) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%7 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %1[%6, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After ForOpCanonicalization (iree-codegen-canonicalize-scf-for) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%7 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %1[%6, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%7 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %1[%6, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%7 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %1[%6, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After OptimizeVectorTransfer (iree-codegen-optimize-vector-transfer) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%7 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %1[%6, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After FlattenMemRefSubspan (iree-codegen-flatten-memref-subspan) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg0, %arg1) | |
%7 = memref.load %0[%6] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
%8 = affine.apply affine_map<(d0)[s0] -> (d0 - s0 + 1)>(%arg1)[%arg0] | |
memref.store %7, %1[%8] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
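// Note (annotation, not compiler output): flatten-memref-subspan retypes the 2x1 memrefs as dynamically
// sized 1-D buffers of %c2 elements and linearizes every (row, col) access. For a row-major 2x1 shape the
// inner stride is 1, so the load offset is simply row + col, matching the (d0, d1) -> (d0 + d1) map above,
// while the store side also folds in the reversed row, giving col - row + 1. A minimal sketch of the
// load-side linearization, with hypothetical names:
func.func @linearized_load_index_sketch(%row: index, %col: index) -> index {
  // memref<2x1xf32> viewed as a flat buffer of 2 elements: offset = row * 1 + col.
  %offset = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%row, %col)
  return %offset : index
}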
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg0, %arg1) | |
%7 = memref.load %0[%6] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
%8 = affine.apply affine_map<(d0)[s0] -> (d0 - s0 + 1)>(%arg1)[%arg0] | |
memref.store %7, %1[%8] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg0, %arg1) | |
%7 = memref.load %0[%6] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
%8 = affine.apply affine_map<(d0)[s0] -> (d0 - s0 + 1)>(%arg1)[%arg0] | |
memref.store %7, %1[%8] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After ConvertAffineToStandard (lower-affine) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%c64 = arith.constant 64 : index | |
%2 = arith.muli %workgroup_id_x, %c64 : index | |
%c64_0 = arith.constant 64 : index | |
%3 = arith.muli %workgroup_count_x, %c64_0 : index | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = arith.addi %arg0, %arg1 : index | |
%7 = memref.load %0[%6] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
%c-1 = arith.constant -1 : index | |
%8 = arith.muli %arg0, %c-1 : index | |
%9 = arith.addi %arg1, %8 : index | |
%c1_1 = arith.constant 1 : index | |
%10 = arith.addi %9, %c1_1 : index | |
memref.store %7, %1[%10] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
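// Note (annotation, not compiler output): lower-affine expands the remaining affine.apply ops into plain
// arith ops, which is where the muli/addi sequence above comes from; the later canonicalizer folds the
// multiply-by-negative-one into arith.subi. A standalone sketch of the store index in that canonical
// lowered form, with hypothetical names:
func.func @lowered_store_index_sketch(%col: index, %row: index) -> index {
  %c1 = arith.constant 1 : index
  // (d0)[s0] -> (d0 - s0 + 1) lowered to arith: (col - row) + 1.
  %0 = arith.subi %col, %row : index
  %1 = arith.addi %0, %c1 : index
  return %1 : index
}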
// -----// IR Dump After TosaToArith (tosa-to-arith) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%c64 = arith.constant 64 : index | |
%2 = arith.muli %workgroup_id_x, %c64 : index | |
%c64_0 = arith.constant 64 : index | |
%3 = arith.muli %workgroup_count_x, %c64_0 : index | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = arith.addi %arg0, %arg1 : index | |
%7 = memref.load %0[%6] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
%c-1 = arith.constant -1 : index | |
%8 = arith.muli %arg0, %c-1 : index | |
%9 = arith.addi %arg1, %8 : index | |
%c1_1 = arith.constant 1 : index | |
%10 = arith.addi %9, %c1_1 : index | |
memref.store %7, %1[%10] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = arith.muli %workgroup_id_x, %c64 : index | |
%3 = arith.muli %workgroup_count_x, %c64 : index | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = arith.addi %arg0, %arg1 : index | |
%7 = memref.load %0[%6] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
%8 = arith.subi %arg1, %arg0 : index | |
%9 = arith.addi %8, %c1 : index | |
memref.store %7, %1[%9] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = arith.muli %workgroup_id_x, %c64 : index | |
%3 = arith.muli %workgroup_count_x, %c64 : index | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = arith.addi %arg0, %arg1 : index | |
%7 = memref.load %0[%6] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
%8 = arith.subi %arg1, %arg0 : index | |
%9 = arith.addi %8, %c1 : index | |
memref.store %7, %1[%9] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After SPIRVMapMemRefStorageClass (iree-spirv-map-memref-storage-class) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2} | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = arith.muli %workgroup_id_x, %c64 : index | |
%3 = arith.muli %workgroup_count_x, %c64 : index | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = arith.addi %arg0, %arg1 : index | |
%7 = memref.load %0[%6] : memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%8 = arith.subi %arg1, %arg0 : index | |
%9 = arith.addi %8, %c1 : index | |
memref.store %7, %1[%9] : memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After SPIRVEmulateI64 (iree-spirv-emulate-i64) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2} | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = arith.muli %workgroup_id_x, %c64 : index | |
%3 = arith.muli %workgroup_count_x, %c64 : index | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = arith.addi %arg0, %arg1 : index | |
%7 = memref.load %0[%6] : memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%8 = arith.subi %arg1, %arg0 : index | |
%9 = arith.addi %8, %c1 : index | |
memref.store %7, %1[%9] : memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2} | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = arith.muli %workgroup_id_x, %c64 : index | |
%3 = arith.muli %workgroup_count_x, %c64 : index | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = arith.addi %arg0, %arg1 : index | |
%7 = memref.load %0[%6] : memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%8 = arith.subi %arg1, %arg0 : index | |
%9 = arith.addi %8, %c1 : index | |
memref.store %7, %1[%9] : memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2} | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = arith.muli %workgroup_id_x, %c64 : index | |
%3 = arith.muli %workgroup_count_x, %c64 : index | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = arith.addi %arg0, %arg1 : index | |
%7 = memref.load %0[%6] : memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%8 = arith.subi %arg1, %arg0 : index | |
%9 = arith.addi %8, %c1 : index | |
memref.store %7, %1[%9] : memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After ConvertToSPIRV (iree-convert-to-spirv) //----- // | |
module attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>} { | |
spirv.module Logical GLSL450 { | |
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.func @_reverse_dim0_dispatch_0() "None" attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [64, 1, 1]>} { | |
%cst64_i32 = spirv.Constant 64 : i32 | |
%cst2_i32 = spirv.Constant 2 : i32 | |
%cst1_i32 = spirv.Constant 1 : i32 | |
%cst0_i32 = spirv.Constant 0 : i32 | |
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32> | |
%__builtin_var_WorkgroupId___addr_0 = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi32> | |
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr_1 = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr_1 : vector<3xi32> | |
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32> | |
%8 = spirv.IMul %1, %cst64_i32 : i32 | |
%9 = spirv.IMul %3, %cst64_i32 : i32 | |
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input> | |
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32> | |
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32> | |
%cst64_i32_2 = spirv.Constant 64 : i32 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%5 : i32) | |
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2 | |
%13 = spirv.SLessThan %12, %cst2_i32 : i32 | |
spirv.BranchConditional %13, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%8 : i32) | |
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2 | |
%16 = spirv.SLessThan %15, %cst1_i32 : i32 | |
spirv.BranchConditional %16, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%11 : i32) | |
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2 | |
%19 = spirv.SLessThan %18, %cst1_i32 : i32 | |
spirv.BranchConditional %19, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
%20 = spirv.IAdd %12, %15 : i32 | |
%cst0_i32_3 = spirv.Constant 0 : i32 | |
%cst0_i32_4 = spirv.Constant 0 : i32 | |
%cst1_i32_5 = spirv.Constant 1 : i32 | |
%21 = spirv.IMul %cst1_i32_5, %20 : i32 | |
%22 = spirv.IAdd %cst0_i32_4, %21 : i32 | |
%23 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32_3, %22] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
%24 = spirv.Load "StorageBuffer" %23 : f32 | |
%25 = spirv.ISub %15, %12 : i32 | |
%26 = spirv.IAdd %25, %cst1_i32 : i32 | |
%cst0_i32_6 = spirv.Constant 0 : i32 | |
%cst0_i32_7 = spirv.Constant 0 : i32 | |
%cst1_i32_8 = spirv.Constant 1 : i32 | |
%27 = spirv.IMul %cst1_i32_8, %26 : i32 | |
%28 = spirv.IAdd %cst0_i32_7, %27 : i32 | |
%29 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32_6, %28] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
spirv.Store "StorageBuffer" %29, %24 : f32 | |
%30 = spirv.IAdd %18, %cst64_i32_2 : i32 | |
spirv.Branch ^bb1(%30 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%17 = spirv.IAdd %15, %9 : i32 | |
spirv.Branch ^bb1(%17 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%14 = spirv.IAdd %12, %7 : i32 | |
spirv.Branch ^bb1(%14 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
spirv.Return | |
} | |
} | |
} | |
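// Note (annotation, not compiler output): iree-convert-to-spirv maps each scf.for onto a spirv.mlir.loop
// region: ^bb1 is the header carrying the induction variable as a block argument and testing it with
// spirv.SLessThan, ^bb2 is the body that branches back to ^bb1 with the incremented value, and ^bb3 is the
// merge block. The HAL bindings become the @__resource_var_0_* globals accessed through spirv.AccessChain,
// and the workgroup/thread IDs become loads of the corresponding built-in Input variables.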
// -----// IR Dump After SPIRVUnifyAliasedResourcePass (spirv-unify-aliased-resource) //----- // | |
spirv.module Logical GLSL450 { | |
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.func @_reverse_dim0_dispatch_0() "None" attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [64, 1, 1]>} { | |
%cst64_i32 = spirv.Constant 64 : i32 | |
%cst2_i32 = spirv.Constant 2 : i32 | |
%cst1_i32 = spirv.Constant 1 : i32 | |
%cst0_i32 = spirv.Constant 0 : i32 | |
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32> | |
%__builtin_var_WorkgroupId___addr_0 = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi32> | |
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr_1 = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr_1 : vector<3xi32> | |
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32> | |
%8 = spirv.IMul %1, %cst64_i32 : i32 | |
%9 = spirv.IMul %3, %cst64_i32 : i32 | |
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input> | |
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32> | |
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32> | |
%cst64_i32_2 = spirv.Constant 64 : i32 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%5 : i32) | |
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2 | |
%13 = spirv.SLessThan %12, %cst2_i32 : i32 | |
spirv.BranchConditional %13, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%8 : i32) | |
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2 | |
%16 = spirv.SLessThan %15, %cst1_i32 : i32 | |
spirv.BranchConditional %16, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%11 : i32) | |
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2 | |
%19 = spirv.SLessThan %18, %cst1_i32 : i32 | |
spirv.BranchConditional %19, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
%20 = spirv.IAdd %12, %15 : i32 | |
%cst0_i32_3 = spirv.Constant 0 : i32 | |
%cst0_i32_4 = spirv.Constant 0 : i32 | |
%cst1_i32_5 = spirv.Constant 1 : i32 | |
%21 = spirv.IMul %cst1_i32_5, %20 : i32 | |
%22 = spirv.IAdd %cst0_i32_4, %21 : i32 | |
%23 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32_3, %22] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
%24 = spirv.Load "StorageBuffer" %23 : f32 | |
%25 = spirv.ISub %15, %12 : i32 | |
%26 = spirv.IAdd %25, %cst1_i32 : i32 | |
%cst0_i32_6 = spirv.Constant 0 : i32 | |
%cst0_i32_7 = spirv.Constant 0 : i32 | |
%cst1_i32_8 = spirv.Constant 1 : i32 | |
%27 = spirv.IMul %cst1_i32_8, %26 : i32 | |
%28 = spirv.IAdd %cst0_i32_7, %27 : i32 | |
%29 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32_6, %28] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
spirv.Store "StorageBuffer" %29, %24 : f32 | |
%30 = spirv.IAdd %18, %cst64_i32_2 : i32 | |
spirv.Branch ^bb1(%30 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%17 = spirv.IAdd %15, %9 : i32 | |
spirv.Branch ^bb1(%17 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%14 = spirv.IAdd %12, %7 : i32 | |
spirv.Branch ^bb1(%14 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
spirv.Return | |
} | |
} | |
// -----// IR Dump After SPIRVLowerABIAttributesPass (spirv-lower-abi-attrs) //----- // | |
spirv.module Logical GLSL450 { | |
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.func @_reverse_dim0_dispatch_0() "None" { | |
%cst64_i32 = spirv.Constant 64 : i32 | |
%cst2_i32 = spirv.Constant 2 : i32 | |
%cst1_i32 = spirv.Constant 1 : i32 | |
%cst0_i32 = spirv.Constant 0 : i32 | |
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32> | |
%__builtin_var_WorkgroupId___addr_0 = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi32> | |
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr_1 = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr_1 : vector<3xi32> | |
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32> | |
%8 = spirv.IMul %1, %cst64_i32 : i32 | |
%9 = spirv.IMul %3, %cst64_i32 : i32 | |
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input> | |
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32> | |
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32> | |
%cst64_i32_2 = spirv.Constant 64 : i32 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%5 : i32) | |
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2 | |
%13 = spirv.SLessThan %12, %cst2_i32 : i32 | |
spirv.BranchConditional %13, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%8 : i32) | |
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2 | |
%16 = spirv.SLessThan %15, %cst1_i32 : i32 | |
spirv.BranchConditional %16, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%11 : i32) | |
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2 | |
%19 = spirv.SLessThan %18, %cst1_i32 : i32 | |
spirv.BranchConditional %19, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
%20 = spirv.IAdd %12, %15 : i32 | |
%cst0_i32_3 = spirv.Constant 0 : i32 | |
%cst0_i32_4 = spirv.Constant 0 : i32 | |
%cst1_i32_5 = spirv.Constant 1 : i32 | |
%21 = spirv.IMul %cst1_i32_5, %20 : i32 | |
%22 = spirv.IAdd %cst0_i32_4, %21 : i32 | |
%23 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32_3, %22] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
%24 = spirv.Load "StorageBuffer" %23 : f32 | |
%25 = spirv.ISub %15, %12 : i32 | |
%26 = spirv.IAdd %25, %cst1_i32 : i32 | |
%cst0_i32_6 = spirv.Constant 0 : i32 | |
%cst0_i32_7 = spirv.Constant 0 : i32 | |
%cst1_i32_8 = spirv.Constant 1 : i32 | |
%27 = spirv.IMul %cst1_i32_8, %26 : i32 | |
%28 = spirv.IAdd %cst0_i32_7, %27 : i32 | |
%29 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32_6, %28] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
spirv.Store "StorageBuffer" %29, %24 : f32 | |
%30 = spirv.IAdd %18, %cst64_i32_2 : i32 | |
spirv.Branch ^bb1(%30 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%17 = spirv.IAdd %15, %9 : i32 | |
spirv.Branch ^bb1(%17 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%14 = spirv.IAdd %12, %7 : i32 | |
spirv.Branch ^bb1(%14 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
spirv.Return | |
} | |
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__ | |
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1 | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
spirv.module Logical GLSL450 { | |
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.func @_reverse_dim0_dispatch_0() "None" { | |
%cst64_i32 = spirv.Constant 64 : i32 | |
%cst2_i32 = spirv.Constant 2 : i32 | |
%cst1_i32 = spirv.Constant 1 : i32 | |
%cst0_i32 = spirv.Constant 0 : i32 | |
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32> | |
%__builtin_var_WorkgroupId___addr_0 = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi32> | |
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr_1 = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr_1 : vector<3xi32> | |
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32> | |
%8 = spirv.IMul %1, %cst64_i32 : i32 | |
%9 = spirv.IMul %3, %cst64_i32 : i32 | |
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input> | |
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32> | |
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32> | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%5 : i32) | |
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2 | |
%13 = spirv.SLessThan %12, %cst2_i32 : i32 | |
spirv.BranchConditional %13, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%8 : i32) | |
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2 | |
%16 = spirv.SLessThan %15, %cst1_i32 : i32 | |
spirv.BranchConditional %16, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%11 : i32) | |
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2 | |
%19 = spirv.SLessThan %18, %cst1_i32 : i32 | |
spirv.BranchConditional %19, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
%20 = spirv.IAdd %12, %15 : i32 | |
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
%22 = spirv.Load "StorageBuffer" %21 : f32 | |
%23 = spirv.ISub %15, %12 : i32 | |
%24 = spirv.IAdd %23, %cst1_i32 : i32 | |
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
spirv.Store "StorageBuffer" %25, %22 : f32 | |
%26 = spirv.IAdd %18, %cst64_i32 : i32 | |
spirv.Branch ^bb1(%26 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%17 = spirv.IAdd %15, %9 : i32 | |
spirv.Branch ^bb1(%17 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%14 = spirv.IAdd %12, %7 : i32 | |
spirv.Branch ^bb1(%14 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
spirv.Return | |
} | |
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__ | |
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1 | |
} | |
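// Compared with the previous dump, the canonicalizer folded the redundant zero/one
// constants (%cst0_i32_3..%cst1_i32_8) and the IMul-by-1 / IAdd-of-0 arithmetic that
// fed each spirv.AccessChain, and dropped the duplicate 64 constant. A minimal C
// sketch of that fold, using illustrative names that are not part of the IR:
#include <stdint.h>

/* Index math as emitted before canonicalization: a zero base offset plus a
   unit stride times the element index. */
static int32_t buffer_index_before(int32_t idx) {
  const int32_t base = 0;    /* the %cst0_i32_* constants */
  const int32_t stride = 1;  /* the %cst1_i32_* constants */
  return base + stride * idx;
}

/* What the fold leaves behind: the canonicalized AccessChain takes the
   element index directly. */
static int32_t buffer_index_after(int32_t idx) {
  return idx;
}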
// -----// IR Dump After CSE (cse) //----- // | |
spirv.module Logical GLSL450 { | |
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.func @_reverse_dim0_dispatch_0() "None" { | |
%cst64_i32 = spirv.Constant 64 : i32 | |
%cst2_i32 = spirv.Constant 2 : i32 | |
%cst1_i32 = spirv.Constant 1 : i32 | |
%cst0_i32 = spirv.Constant 0 : i32 | |
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32> | |
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32> | |
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32> | |
%8 = spirv.IMul %1, %cst64_i32 : i32 | |
%9 = spirv.IMul %3, %cst64_i32 : i32 | |
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input> | |
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32> | |
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32> | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%5 : i32) | |
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2 | |
%13 = spirv.SLessThan %12, %cst2_i32 : i32 | |
spirv.BranchConditional %13, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%8 : i32) | |
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2 | |
%16 = spirv.SLessThan %15, %cst1_i32 : i32 | |
spirv.BranchConditional %16, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%11 : i32) | |
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2 | |
%19 = spirv.SLessThan %18, %cst1_i32 : i32 | |
spirv.BranchConditional %19, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
%20 = spirv.IAdd %12, %15 : i32 | |
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
%22 = spirv.Load "StorageBuffer" %21 : f32 | |
%23 = spirv.ISub %15, %12 : i32 | |
%24 = spirv.IAdd %23, %cst1_i32 : i32 | |
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
spirv.Store "StorageBuffer" %25, %22 : f32 | |
%26 = spirv.IAdd %18, %cst64_i32 : i32 | |
spirv.Branch ^bb1(%26 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%17 = spirv.IAdd %15, %9 : i32 | |
spirv.Branch ^bb1(%17 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%14 = spirv.IAdd %12, %7 : i32 | |
spirv.Branch ^bb1(%14 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
spirv.Return | |
} | |
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__ | |
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1 | |
} | |
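// CSE removed the repeated spirv.mlir.addressof of the WorkgroupId and NumWorkgroups
// builtins (the ..._addr_0 / ..._addr_1 copies are gone); the two loads of each builtin
// remain and now share a single address value. A minimal C analogue of that
// deduplication, with illustrative names:
#include <stdint.h>

/* Before CSE: the address of the same global is rematerialized per load. */
static int32_t wg_xy_before_cse(const int32_t *workgroup_id) {
  const int32_t *a  = workgroup_id;  /* addressof @__builtin_var_WorkgroupId__ */
  int32_t x = a[0];                  /* load component .x */
  const int32_t *a0 = workgroup_id;  /* duplicate addressof, removed by CSE */
  int32_t y = a0[1];                 /* load component .y */
  return x + y;
}

/* After CSE: one addressof, reused by both loads. */
static int32_t wg_xy_after_cse(const int32_t *workgroup_id) {
  const int32_t *a = workgroup_id;
  return a[0] + a[1];
}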
// -----// IR Dump After SPIRVRewriteInsertsPass (spirv-rewrite-inserts) //----- // | |
spirv.module Logical GLSL450 { | |
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.func @_reverse_dim0_dispatch_0() "None" { | |
%cst64_i32 = spirv.Constant 64 : i32 | |
%cst2_i32 = spirv.Constant 2 : i32 | |
%cst1_i32 = spirv.Constant 1 : i32 | |
%cst0_i32 = spirv.Constant 0 : i32 | |
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32> | |
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32> | |
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32> | |
%8 = spirv.IMul %1, %cst64_i32 : i32 | |
%9 = spirv.IMul %3, %cst64_i32 : i32 | |
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input> | |
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32> | |
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32> | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%5 : i32) | |
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2 | |
%13 = spirv.SLessThan %12, %cst2_i32 : i32 | |
spirv.BranchConditional %13, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%8 : i32) | |
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2 | |
%16 = spirv.SLessThan %15, %cst1_i32 : i32 | |
spirv.BranchConditional %16, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%11 : i32) | |
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2 | |
%19 = spirv.SLessThan %18, %cst1_i32 : i32 | |
spirv.BranchConditional %19, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
%20 = spirv.IAdd %12, %15 : i32 | |
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
%22 = spirv.Load "StorageBuffer" %21 : f32 | |
%23 = spirv.ISub %15, %12 : i32 | |
%24 = spirv.IAdd %23, %cst1_i32 : i32 | |
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
spirv.Store "StorageBuffer" %25, %22 : f32 | |
%26 = spirv.IAdd %18, %cst64_i32 : i32 | |
spirv.Branch ^bb1(%26 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%17 = spirv.IAdd %15, %9 : i32 | |
spirv.Branch ^bb1(%17 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%14 = spirv.IAdd %12, %7 : i32 | |
spirv.Branch ^bb1(%14 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
spirv.Return | |
} | |
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__ | |
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1 | |
} | |
// -----// IR Dump After SPIRVCanonicalizeGLPass (spirv-canonicalize-gl) //----- // | |
spirv.module Logical GLSL450 { | |
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.func @_reverse_dim0_dispatch_0() "None" { | |
%cst64_i32 = spirv.Constant 64 : i32 | |
%cst2_i32 = spirv.Constant 2 : i32 | |
%cst1_i32 = spirv.Constant 1 : i32 | |
%cst0_i32 = spirv.Constant 0 : i32 | |
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32> | |
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32> | |
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32> | |
%8 = spirv.IMul %1, %cst64_i32 : i32 | |
%9 = spirv.IMul %3, %cst64_i32 : i32 | |
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input> | |
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32> | |
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32> | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%5 : i32) | |
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2 | |
%13 = spirv.SLessThan %12, %cst2_i32 : i32 | |
spirv.BranchConditional %13, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%8 : i32) | |
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2 | |
%16 = spirv.SLessThan %15, %cst1_i32 : i32 | |
spirv.BranchConditional %16, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%11 : i32) | |
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2 | |
%19 = spirv.SLessThan %18, %cst1_i32 : i32 | |
spirv.BranchConditional %19, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
%20 = spirv.IAdd %12, %15 : i32 | |
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
%22 = spirv.Load "StorageBuffer" %21 : f32 | |
%23 = spirv.ISub %15, %12 : i32 | |
%24 = spirv.IAdd %23, %cst1_i32 : i32 | |
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
spirv.Store "StorageBuffer" %25, %22 : f32 | |
%26 = spirv.IAdd %18, %cst64_i32 : i32 | |
spirv.Branch ^bb1(%26 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%17 = spirv.IAdd %15, %9 : i32 | |
spirv.Branch ^bb1(%17 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%14 = spirv.IAdd %12, %7 : i32 | |
spirv.Branch ^bb1(%14 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
spirv.Return | |
} | |
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__ | |
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1 | |
} | |
// -----// IR Dump After SPIRVUpdateVCEPass (spirv-update-vce) //----- // | |
spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> { | |
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.func @_reverse_dim0_dispatch_0() "None" { | |
%cst64_i32 = spirv.Constant 64 : i32 | |
%cst2_i32 = spirv.Constant 2 : i32 | |
%cst1_i32 = spirv.Constant 1 : i32 | |
%cst0_i32 = spirv.Constant 0 : i32 | |
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32> | |
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32> | |
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32> | |
%8 = spirv.IMul %1, %cst64_i32 : i32 | |
%9 = spirv.IMul %3, %cst64_i32 : i32 | |
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input> | |
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32> | |
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32> | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%5 : i32) | |
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2 | |
%13 = spirv.SLessThan %12, %cst2_i32 : i32 | |
spirv.BranchConditional %13, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%8 : i32) | |
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2 | |
%16 = spirv.SLessThan %15, %cst1_i32 : i32 | |
spirv.BranchConditional %16, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%11 : i32) | |
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2 | |
%19 = spirv.SLessThan %18, %cst1_i32 : i32 | |
spirv.BranchConditional %19, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
%20 = spirv.IAdd %12, %15 : i32 | |
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
%22 = spirv.Load "StorageBuffer" %21 : f32 | |
%23 = spirv.ISub %15, %12 : i32 | |
%24 = spirv.IAdd %23, %cst1_i32 : i32 | |
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
spirv.Store "StorageBuffer" %25, %22 : f32 | |
%26 = spirv.IAdd %18, %cst64_i32 : i32 | |
spirv.Branch ^bb1(%26 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%17 = spirv.IAdd %15, %9 : i32 | |
spirv.Branch ^bb1(%17 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%14 = spirv.IAdd %12, %7 : i32 | |
spirv.Branch ^bb1(%14 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
spirv.Return | |
} | |
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__ | |
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1 | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateTargetExecutableVariantsPass (iree-hal-translate-target-executable-variants) //----- // | |
hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>}> { | |
hal.executable.export public @_reverse_dim0_dispatch_0 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [64 : index, 1 : index, 1 : index]} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg2] | |
hal.return %0, %arg1, %c1 : index, index, index | |
} | |
builtin.module attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>} { | |
spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> { | |
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.func @_reverse_dim0_dispatch_0() "None" { | |
%cst64_i32 = spirv.Constant 64 : i32 | |
%cst2_i32 = spirv.Constant 2 : i32 | |
%cst1_i32 = spirv.Constant 1 : i32 | |
%cst0_i32 = spirv.Constant 0 : i32 | |
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32> | |
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32> | |
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32> | |
%8 = spirv.IMul %1, %cst64_i32 : i32 | |
%9 = spirv.IMul %3, %cst64_i32 : i32 | |
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input> | |
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32> | |
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32> | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%5 : i32) | |
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2 | |
%13 = spirv.SLessThan %12, %cst2_i32 : i32 | |
spirv.BranchConditional %13, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%8 : i32) | |
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2 | |
%16 = spirv.SLessThan %15, %cst1_i32 : i32 | |
spirv.BranchConditional %16, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%11 : i32) | |
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2 | |
%19 = spirv.SLessThan %18, %cst1_i32 : i32 | |
spirv.BranchConditional %19, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
%20 = spirv.IAdd %12, %15 : i32 | |
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
%22 = spirv.Load "StorageBuffer" %21 : f32 | |
%23 = spirv.ISub %15, %12 : i32 | |
%24 = spirv.IAdd %23, %cst1_i32 : i32 | |
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
spirv.Store "StorageBuffer" %25, %22 : f32 | |
%26 = spirv.IAdd %18, %cst64_i32 : i32 | |
spirv.Branch ^bb1(%26 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%17 = spirv.IAdd %15, %9 : i32 | |
spirv.Branch ^bb1(%17 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%14 = spirv.IAdd %12, %7 : i32 | |
spirv.Branch ^bb1(%14 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
spirv.Return | |
} | |
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__ | |
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1 | |
} | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass (iree-hal-translate-executables) //----- // | |
hal.executable public @_reverse_dim0_dispatch_0 { | |
hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>}> { | |
hal.executable.export public @_reverse_dim0_dispatch_0 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [64 : index, 1 : index, 1 : index]} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg2] | |
hal.return %0, %arg1, %c1 : index, index, index | |
} | |
builtin.module attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>} { | |
spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> { | |
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.func @_reverse_dim0_dispatch_0() "None" { | |
%cst64_i32 = spirv.Constant 64 : i32 | |
%cst2_i32 = spirv.Constant 2 : i32 | |
%cst1_i32 = spirv.Constant 1 : i32 | |
%cst0_i32 = spirv.Constant 0 : i32 | |
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32> | |
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32> | |
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32> | |
%8 = spirv.IMul %1, %cst64_i32 : i32 | |
%9 = spirv.IMul %3, %cst64_i32 : i32 | |
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input> | |
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32> | |
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32> | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%5 : i32) | |
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2 | |
%13 = spirv.SLessThan %12, %cst2_i32 : i32 | |
spirv.BranchConditional %13, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%8 : i32) | |
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2 | |
%16 = spirv.SLessThan %15, %cst1_i32 : i32 | |
spirv.BranchConditional %16, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%11 : i32) | |
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2 | |
%19 = spirv.SLessThan %18, %cst1_i32 : i32 | |
spirv.BranchConditional %19, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
%20 = spirv.IAdd %12, %15 : i32 | |
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
%22 = spirv.Load "StorageBuffer" %21 : f32 | |
%23 = spirv.ISub %15, %12 : i32 | |
%24 = spirv.IAdd %23, %cst1_i32 : i32 | |
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
spirv.Store "StorageBuffer" %25, %22 : f32 | |
%26 = spirv.IAdd %18, %cst64_i32 : i32 | |
spirv.Branch ^bb1(%26 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%17 = spirv.IAdd %15, %9 : i32 | |
spirv.Branch ^bb1(%17 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%14 = spirv.IAdd %12, %7 : i32 | |
spirv.Branch ^bb1(%14 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
spirv.Return | |
} | |
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__ | |
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1 | |
} | |
} | |
} | |
} | |
#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<SPIRVBaseDistribute> | |
module { | |
hal.executable public @_reverse_dim0_dispatch_0 { | |
hal.executable.variant public @vulkan_spirv_fb, target = #executable_target_vulkan_spirv_fb { | |
hal.executable.export public @_reverse_dim0_dispatch_0 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation, workgroup_size = [64 : index, 1 : index, 1 : index]} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply #map()[%arg2] | |
hal.return %0, %arg1, %c1 : index, index, index | |
} | |
builtin.module attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>} { | |
spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> { | |
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.func @_reverse_dim0_dispatch_0() "None" { | |
%cst64_i32 = spirv.Constant 64 : i32 | |
%cst2_i32 = spirv.Constant 2 : i32 | |
%cst1_i32 = spirv.Constant 1 : i32 | |
%cst0_i32 = spirv.Constant 0 : i32 | |
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32> | |
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32> | |
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32> | |
%8 = spirv.IMul %1, %cst64_i32 : i32 | |
%9 = spirv.IMul %3, %cst64_i32 : i32 | |
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input> | |
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32> | |
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32> | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%5 : i32) | |
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2 | |
%13 = spirv.SLessThan %12, %cst2_i32 : i32 | |
spirv.BranchConditional %13, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%8 : i32) | |
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2 | |
%16 = spirv.SLessThan %15, %cst1_i32 : i32 | |
spirv.BranchConditional %16, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%11 : i32) | |
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2 | |
%19 = spirv.SLessThan %18, %cst1_i32 : i32 | |
spirv.BranchConditional %19, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
%20 = spirv.IAdd %12, %15 : i32 | |
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
%22 = spirv.Load "StorageBuffer" %21 : f32 | |
%23 = spirv.ISub %15, %12 : i32 | |
%24 = spirv.IAdd %23, %cst1_i32 : i32 | |
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
spirv.Store "StorageBuffer" %25, %22 : f32 | |
%26 = spirv.IAdd %18, %cst64_i32 : i32 | |
spirv.Branch ^bb1(%26 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%17 = spirv.IAdd %15, %9 : i32 | |
spirv.Branch ^bb1(%17 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%14 = spirv.IAdd %12, %7 : i32 | |
spirv.Branch ^bb1(%14 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
spirv.Return | |
} | |
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__ | |
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1 | |
} | |
} | |
} | |
} | |
} | |
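// The spirv.func in the final dump above is what actually runs on the device. A minimal
// C sketch of what a single invocation appears to compute, using illustrative names
// (wg_id, num_wg, local_id, src, dst) that are not part of the IR; the bounds 2 and 1
// come from the tensor<2x1xf32> shape and the 64s from the workgroup size:
#include <stdint.h>

typedef struct { int32_t x, y, z; } ivec3_t;

/* Grid-strided loops mirroring the three nested spirv.mlir.loop regions. */
static void reverse_dim0_invocation(ivec3_t wg_id, ivec3_t num_wg,
                                    ivec3_t local_id,
                                    const float *src, float *dst) {
  for (int32_t i = wg_id.y; i < 2; i += num_wg.y) {              /* rows of the 2x1 tensor */
    for (int32_t j = wg_id.x * 64; j < 1; j += num_wg.x * 64) {  /* columns, tiled by 64 */
      for (int32_t k = local_id.x; k < 1; k += 64) {             /* lanes within a workgroup */
        /* Load src[i + j] and store it at the mirrored index (j - i) + 1. */
        dst[(j - i) + 1] = src[i + j];
      }
    }
  }
}

// Since the j and k loops only admit the value 0, this collapses to dst[1 - i] = src[i]
// for i in {0, 1}: the two rows are swapped, which matches the reverse along dimension 0
// that the _reverse_dim0_dispatch_0 name and the original iree_linalg_ext.reverse imply.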