// -----// IR Dump After TypePropagation (iree-codegen-type-propagation) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<2x1xf32> | |
%3 = tensor.empty() : tensor<2x1xf32> | |
%4 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%2 : tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) : tensor<2x1xf32> | |
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
return | |
} | |
// -----// IR Dump After BufferizeCopyOnlyDispatches (iree-codegen-bufferize-copy-only-dispatches) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<2x1xf32> | |
%3 = tensor.empty() : tensor<2x1xf32> | |
%4 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%2 : tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) : tensor<2x1xf32> | |
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
return | |
} | |
} | |
// -----// IR Dump After DecomposeSoftmax (iree-linalg-ext-decompose-softmax) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<2x1xf32> | |
%3 = tensor.empty() : tensor<2x1xf32> | |
%4 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%2 : tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) : tensor<2x1xf32> | |
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
return | |
} | |
// -----// IR Dump After RematerializeParallelOps (iree-codegen-rematerialize-parallel-ops) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<2x1xf32> | |
%3 = tensor.empty() : tensor<2x1xf32> | |
%4 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%2 : tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) : tensor<2x1xf32> | |
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
return | |
} | |
// -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) //----- // | |
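// Note: the four dumps above are unchanged; this is the first pass in the log that rewrites the dispatch. The dump below shows the surrounding hal.executable.variant/export (SPIRVBaseDistribute pipeline, workgroup_size = [64, 1, 1]), the reverse tiled with tile_sizes [[1, 64], [1, 1]] and distributed over workgroup ids, and the store offset mirrored along dim 0 via affine_map<()[s0] -> (-s0 + 1)>.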
hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>}> { | |
hal.executable.export public @_reverse_dim0_dispatch_0 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [64 : index, 1 : index, 1 : index]} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg2] | |
hal.return %0, %arg1, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1], sizes = [1, %c1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<1x?xf32> | |
%5 = tensor.empty() : tensor<1x1xf32> | |
%6 = iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%4 : tensor<1x?xf32>) outs(%5 : tensor<1x1xf32>) : tensor<1x1xf32> | |
%cast = tensor.cast %6 : tensor<1x1xf32> to tensor<1x?xf32> | |
%7 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
flow.dispatch.tensor.store %cast, %1, offsets = [%7, %arg1], sizes = [1, %c1], strides = [1, 1] : tensor<1x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
} | |
} | |
return | |
} | |
} | |
} | |
// -----// IR Dump After ConvertToDestinationPassingStyle (iree-codegen-convert-to-destination-passing-style) //----- // | |
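// Note: the tensor.empty destination is gone; the reverse now takes as its outs operand a slice loaded from the writeonly output binding at the mirrored offset, so it computes directly into its final destination.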
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%5 = flow.dispatch.tensor.load %1, offsets = [%4, %arg1], sizes = [1, %c1], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> -> tensor<1x?xf32> | |
%cast = tensor.cast %5 : tensor<1x?xf32> to tensor<1x1xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1], sizes = [1, %c1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<1x?xf32> | |
%7 = iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%6 : tensor<1x?xf32>) outs(%cast : tensor<1x1xf32>) : tensor<1x1xf32> | |
%cast_0 = tensor.cast %7 : tensor<1x1xf32> to tensor<1x?xf32> | |
%8 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
flow.dispatch.tensor.store %cast_0, %1, offsets = [%8, %arg1], sizes = [1, %c1], strides = [1, 1] : tensor<1x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
} | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%5 = flow.dispatch.tensor.load %1, offsets = [%4, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> -> tensor<1x1xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<1x1xf32> | |
%7 = iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%6 : tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) : tensor<1x1xf32> | |
%8 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
flow.dispatch.tensor.store %7, %1, offsets = [%8, %arg1], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%5 = flow.dispatch.tensor.load %1, offsets = [%4, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> -> tensor<1x1xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<1x1xf32> | |
%7 = iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%6 : tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) : tensor<1x1xf32> | |
flow.dispatch.tensor.store %7, %1, offsets = [%4, %arg1], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After EliminateEmptyTensors (iree-eliminate-empty-tensors) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%5 = flow.dispatch.tensor.load %1, offsets = [%4, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> -> tensor<1x1xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<1x1xf32> | |
%7 = iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%6 : tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) : tensor<1x1xf32> | |
flow.dispatch.tensor.store %7, %1, offsets = [%4, %arg1], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%5 = flow.dispatch.tensor.load %1, offsets = [%4, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> -> tensor<1x1xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> -> tensor<1x1xf32> | |
%7 = iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%6 : tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) : tensor<1x1xf32> | |
flow.dispatch.tensor.store %7, %1, offsets = [%4, %arg1], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After IREEComprehensiveBufferize (iree-codegen-iree-comprehensive-bufferize) //----- // | |
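// Note: the dispatch is bufferized here: the bindings become 64-byte-aligned storage-buffer memrefs, the tensor loads/stores become memref.subview ops, and the trailing store turns into a memref.copy between two subviews of the same output slice (cleaned up by the CSE/canonicalization runs below).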
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %4 to %c1 step %5 { | |
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
%subview_1 = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview, %subview_1 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %4 to %c1 step %5 { | |
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
%subview_1 = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview, %subview_1 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %4 to %c1 step %5 { | |
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
%subview_1 = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview, %subview_1 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
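// Note: CSE folds the two identical output subviews, so the copy becomes a self-copy (memref.copy %subview, %subview); the canonicalizer run that follows drops it entirely.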
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %4 to %c1 step %5 { | |
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
memref.copy %subview, %subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1xf32>> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1xf32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %4 to %c1 step %5 { | |
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %2[%6, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
} | |
} | |
return | |
} | |
// -----// IR Dump After CleanupBufferAllocView (iree-codegen-cleanup-buffer-alloc-view) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.reverse {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
} | |
} | |
return | |
} | |
// -----// IR Dump After SPIRVTileAndDistribute (iree-spirv-tile-and-distribute) //----- // | |
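// Note: the per-workgroup tile is further distributed across invocations with gpu.thread_id / gpu.block_dim loops bounded by %c1, and the reverse is tagged __internal_linalg_transform__ = "tile_reduction" on dynamically shaped casts of the subviews.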
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
%7 = gpu.thread_id y | |
%8 = gpu.block_dim y | |
scf.for %arg2 = %7 to %c1 step %8 { | |
scf.for %arg3 = %5 to %c1 step %6 { | |
%cast = memref.cast %subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%cast_1 = memref.cast %subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.reverse {__internal_linalg_transform__ = "tile_reduction", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%cast : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%cast_1 : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
} | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After MemrefCopyToLinalgPass (iree-codegen-memrefcopy-to-linalg) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
%7 = gpu.thread_id y | |
%8 = gpu.block_dim y | |
scf.for %arg2 = %7 to %c1 step %8 { | |
scf.for %arg3 = %5 to %c1 step %6 { | |
%cast = memref.cast %subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%cast_1 = memref.cast %subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.reverse {__internal_linalg_transform__ = "tile_reduction", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%cast : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%cast_1 : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
} | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After GPUDistributeSharedMemoryCopy (iree-gpu-distribute-shared-memory-copy) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
%7 = gpu.thread_id y | |
%8 = gpu.block_dim y | |
scf.for %arg2 = %7 to %c1 step %8 { | |
scf.for %arg3 = %5 to %c1 step %6 { | |
%cast = memref.cast %subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%cast_1 = memref.cast %subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.reverse {__internal_linalg_transform__ = "tile_reduction", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%cast : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%cast_1 : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
} | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
%7 = gpu.thread_id y | |
%8 = gpu.block_dim y | |
scf.for %arg2 = %7 to %c1 step %8 { | |
scf.for %arg3 = %5 to %c1 step %6 { | |
%cast = memref.cast %subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%cast_1 = memref.cast %subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.reverse {__internal_linalg_transform__ = "tile_reduction", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%cast : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%cast_1 : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
} | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
%7 = gpu.thread_id y | |
%8 = gpu.block_dim y | |
scf.for %arg2 = %7 to %c1 step %8 { | |
scf.for %arg3 = %5 to %c1 step %6 { | |
%cast = memref.cast %subview_0 : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%cast_1 = memref.cast %subview : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.reverse {__internal_linalg_transform__ = "tile_reduction", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64], [1, 1]]>} dimensions(dense<0> : tensor<1xi64>) ins(%cast : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%cast_1 : memref<?x?xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
} | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After LinalgExtToLoops (iree-linalg-ext-to-loops) //----- // | |
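// Note: iree_linalg_ext.reverse is lowered to explicit loops: each element is loaded from the input subview and stored at the mirrored dim-0 index (%c0 - %arg4; the tile is 1x1, so this is always 0) in the output subview.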
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
%7 = gpu.thread_id y | |
%8 = gpu.block_dim y | |
scf.for %arg2 = %7 to %c1 step %8 { | |
scf.for %arg3 = %5 to %c1 step %6 { | |
scf.for %arg4 = %c0 to %c1 step %c1 { | |
scf.for %arg5 = %c0 to %c1 step %c1 { | |
%9 = arith.subi %c0, %arg4 : index | |
%10 = memref.load %subview_0[%arg4, %arg5] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.store %10, %subview[%9, %arg5] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After MemrefCopyToLinalgPass (iree-codegen-memrefcopy-to-linalg) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
%7 = gpu.thread_id y | |
%8 = gpu.block_dim y | |
scf.for %arg2 = %7 to %c1 step %8 { | |
scf.for %arg3 = %5 to %c1 step %6 { | |
scf.for %arg4 = %c0 to %c1 step %c1 { | |
scf.for %arg5 = %c0 to %c1 step %c1 { | |
%9 = arith.subi %c0, %arg4 : index | |
%10 = memref.load %subview_0[%arg4, %arg5] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.store %10, %subview[%9, %arg5] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After LinalgLowerToLoops (convert-linalg-to-loops) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
%7 = gpu.thread_id y | |
%8 = gpu.block_dim y | |
scf.for %arg2 = %7 to %c1 step %8 { | |
scf.for %arg3 = %5 to %c1 step %6 { | |
scf.for %arg4 = %c0 to %c1 step %c1 { | |
scf.for %arg5 = %c0 to %c1 step %c1 { | |
%9 = arith.subi %c0, %arg4 : index | |
%10 = memref.load %subview_0[%arg4, %arg5] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.store %10, %subview[%9, %arg5] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After RemoveSingleIterationLoop (iree-codegen-remove-single-iteration-loop) //----- // | |
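// Note: the unit-trip loops (the thread-id y loop and the two scalar loops) are removed, leaving a single load/store at [%c0, %c0] inside the remaining thread-id x loop.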
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
scf.for %arg2 = %5 to %c1 step %6 { | |
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
scf.for %arg2 = %5 to %c1 step %6 { | |
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
scf.for %arg2 = %5 to %c1 step %6 { | |
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After SPIRVLowerExecutableTarget (iree-spirv-lower-executable-target-pass) //----- // | |
hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>}> { | |
hal.executable.export public @_reverse_dim0_dispatch_0 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [64 : index, 1 : index, 1 : index]} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg2] | |
hal.return %0, %arg1, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
scf.for %arg2 = %5 to %c1 step %6 { | |
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
scf.for %arg2 = %5 to %c1 step %6 { | |
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
scf.for %arg2 = %5 to %c1 step %6 { | |
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After PolynomialApproximationPass (iree-codegen-polynomial-approximation) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
scf.for %arg2 = %5 to %c1 step %6 { | |
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After PadDynamicAlloc (iree-codegen-pad-dynamic-alloc) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%subview = memref.subview %1[%4, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %0[%arg0, %arg1] [1, 1] [1, 1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
scf.for %arg2 = %5 to %c1 step %6 { | |
%7 = memref.load %subview_0[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %subview[%c0, %c0] : memref<1x1xf32, strided<[1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
scf.for %arg2 = %5 to %c1 step %6 { | |
%7 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg0] | |
%8 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1] | |
%9 = memref.load %0[%7, %8] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%10 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%4] | |
%11 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1] | |
memref.store %9, %1[%10, %11] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
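// Note (annotation, not compiler output): fold-memref-alias-ops removes the 1x1 memref.subview ops and
// folds their offsets straight into the load/store indices on the 2x1 base buffers, via the
// (d0)[s0] -> (d0 + s0) maps above. A minimal standalone sketch of that folded index computation follows;
// the function and operand names are hypothetical, not taken from the dump.
func.func @folded_index_sketch(%subview_offset: index) -> index {
  %c0 = arith.constant 0 : index
  // With the in-tile index d0 = 0, (d0 + s0) reduces to the subview offset itself.
  %idx = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%subview_offset]
  return %idx : index
}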
// -----// IR Dump After ExpandOps (memref-expand) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
%5 = gpu.thread_id x | |
%6 = gpu.block_dim x | |
scf.for %arg2 = %5 to %c1 step %6 { | |
%7 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg0] | |
%8 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1] | |
%9 = memref.load %0[%7, %8] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%10 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%4] | |
%11 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1] | |
memref.store %9, %1[%10, %11] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%7 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
memref.store %6, %1[%7, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%7 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
memref.store %6, %1[%7, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
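// Note (annotation, not compiler output): after canonicalization and CSE the reverse along dim 0 is fully
// explicit in the index arithmetic: the value loaded from row %arg0 is stored to row (-%arg0 + 1), so rows
// 0 and 1 swap for this size-2 dimension. A tiny standalone sketch of just that mapping, with hypothetical
// names:
func.func @reversed_row_sketch(%row: index) -> index {
  // For a dimension of size 2: reversed index = size - 1 - row = -row + 1.
  %rev = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%row]
  return %rev : index
}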
// -----// IR Dump After SPIRVVectorizeLoadStore (iree-spirv-vectorize-load-store) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%7 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
memref.store %6, %1[%7, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After OptimizeVectorTransfer (iree-codegen-optimize-vector-transfer) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%7 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %1[%6, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After SPIRVBreakDownLargeVector (iree-spirv-breakdown-large-vector) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%7 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %1[%6, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After ForOpCanonicalization (iree-codegen-canonicalize-scf-for) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%7 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %1[%6, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%7 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %1[%6, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%7 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %1[%6, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After OptimizeVectorTransfer (iree-codegen-optimize-vector-transfer) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
%6 = affine.apply affine_map<()[s0] -> (-s0 + 1)>()[%arg0] | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%7 = memref.load %0[%arg0, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
memref.store %7, %1[%6, %arg1] : memref<2x1xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After FlattenMemRefSubspan (iree-codegen-flatten-memref-subspan) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg0, %arg1) | |
%7 = memref.load %0[%6] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
%8 = affine.apply affine_map<(d0)[s0] -> (d0 - s0 + 1)>(%arg1)[%arg0] | |
memref.store %7, %1[%8] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
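// Note (annotation, not compiler output): flatten-memref-subspan retypes the 2x1 memrefs as dynamically
// sized 1-D buffers of %c2 elements and linearizes every (row, col) access. For a row-major 2x1 shape the
// inner stride is 1, so the load offset is simply row + col, matching the (d0, d1) -> (d0 + d1) map above,
// while the store side also folds in the reversed row, giving col - row + 1. A minimal sketch of the
// load-side linearization, with hypothetical names:
func.func @linearized_load_index_sketch(%row: index, %col: index) -> index {
  // memref<2x1xf32> viewed as a flat buffer of 2 elements: offset = row * 1 + col.
  %offset = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%row, %col)
  return %offset : index
}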
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg0, %arg1) | |
%7 = memref.load %0[%6] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
%8 = affine.apply affine_map<(d0)[s0] -> (d0 - s0 + 1)>(%arg1)[%arg0] | |
memref.store %7, %1[%8] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg0, %arg1) | |
%7 = memref.load %0[%6] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
%8 = affine.apply affine_map<(d0)[s0] -> (d0 - s0 + 1)>(%arg1)[%arg0] | |
memref.store %7, %1[%8] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After ConvertAffineToStandard (lower-affine) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%c64 = arith.constant 64 : index | |
%2 = arith.muli %workgroup_id_x, %c64 : index | |
%c64_0 = arith.constant 64 : index | |
%3 = arith.muli %workgroup_count_x, %c64_0 : index | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = arith.addi %arg0, %arg1 : index | |
%7 = memref.load %0[%6] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
%c-1 = arith.constant -1 : index | |
%8 = arith.muli %arg0, %c-1 : index | |
%9 = arith.addi %arg1, %8 : index | |
%c1_1 = arith.constant 1 : index | |
%10 = arith.addi %9, %c1_1 : index | |
memref.store %7, %1[%10] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
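// Note (annotation, not compiler output): lower-affine expands the remaining affine.apply ops into plain
// arith ops, which is where the muli/addi sequence above comes from; the later canonicalizer folds the
// multiply-by-negative-one into arith.subi. A standalone sketch of the store index in that canonical
// lowered form, with hypothetical names:
func.func @lowered_store_index_sketch(%col: index, %row: index) -> index {
  %c1 = arith.constant 1 : index
  // (d0)[s0] -> (d0 - s0 + 1) lowered to arith: (col - row) + 1.
  %0 = arith.subi %col, %row : index
  %1 = arith.addi %0, %c1 : index
  return %1 : index
}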
// -----// IR Dump After TosaToArith (tosa-to-arith) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%c64 = arith.constant 64 : index | |
%2 = arith.muli %workgroup_id_x, %c64 : index | |
%c64_0 = arith.constant 64 : index | |
%3 = arith.muli %workgroup_count_x, %c64_0 : index | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = arith.addi %arg0, %arg1 : index | |
%7 = memref.load %0[%6] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
%c-1 = arith.constant -1 : index | |
%8 = arith.muli %arg0, %c-1 : index | |
%9 = arith.addi %arg1, %8 : index | |
%c1_1 = arith.constant 1 : index | |
%10 = arith.addi %9, %c1_1 : index | |
memref.store %7, %1[%10] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = arith.muli %workgroup_id_x, %c64 : index | |
%3 = arith.muli %workgroup_count_x, %c64 : index | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = arith.addi %arg0, %arg1 : index | |
%7 = memref.load %0[%6] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
%8 = arith.subi %arg1, %arg0 : index | |
%9 = arith.addi %8, %c1 : index | |
memref.store %7, %1[%9] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #hal.descriptor_type<storage_buffer>>{%c2} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = arith.muli %workgroup_id_x, %c64 : index | |
%3 = arith.muli %workgroup_count_x, %c64 : index | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = arith.addi %arg0, %arg1 : index | |
%7 = memref.load %0[%6] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
%8 = arith.subi %arg1, %arg0 : index | |
%9 = arith.addi %8, %c1 : index | |
memref.store %7, %1[%9] : memref<?xf32, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After SPIRVMapMemRefStorageClass (iree-spirv-map-memref-storage-class) //----- // | |
func.func @_reverse_dim0_dispatch_0() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2} | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = arith.muli %workgroup_id_x, %c64 : index | |
%3 = arith.muli %workgroup_count_x, %c64 : index | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = arith.addi %arg0, %arg1 : index | |
%7 = memref.load %0[%6] : memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%8 = arith.subi %arg1, %arg0 : index | |
%9 = arith.addi %8, %c1 : index | |
memref.store %7, %1[%9] : memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After SPIRVEmulateI64 (iree-spirv-emulate-i64) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2} | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = arith.muli %workgroup_id_x, %c64 : index | |
%3 = arith.muli %workgroup_count_x, %c64 : index | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = arith.addi %arg0, %arg1 : index | |
%7 = memref.load %0[%6] : memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%8 = arith.subi %arg1, %arg0 : index | |
%9 = arith.addi %8, %c1 : index | |
memref.store %7, %1[%9] : memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2} | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = arith.muli %workgroup_id_x, %c64 : index | |
%3 = arith.muli %workgroup_count_x, %c64 : index | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = arith.addi %arg0, %arg1 : index | |
%7 = memref.load %0[%6] : memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%8 = arith.subi %arg1, %arg0 : index | |
%9 = arith.addi %8, %c1 : index | |
memref.store %7, %1[%9] : memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
module { | |
func.func @_reverse_dim0_dispatch_0() { | |
%c64 = arith.constant 64 : index | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2} | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xf32, #spirv.storage_class<StorageBuffer>>{%c2} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = arith.muli %workgroup_id_x, %c64 : index | |
%3 = arith.muli %workgroup_count_x, %c64 : index | |
%4 = gpu.thread_id x | |
%5 = gpu.block_dim x | |
scf.for %arg0 = %workgroup_id_y to %c2 step %workgroup_count_y { | |
scf.for %arg1 = %2 to %c1 step %3 { | |
scf.for %arg2 = %4 to %c1 step %5 { | |
%6 = arith.addi %arg0, %arg1 : index | |
%7 = memref.load %0[%6] : memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%8 = arith.subi %arg1, %arg0 : index | |
%9 = arith.addi %8, %c1 : index | |
memref.store %7, %1[%9] : memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After ConvertToSPIRV (iree-convert-to-spirv) //----- // | |
module attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>} { | |
spirv.module Logical GLSL450 { | |
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.func @_reverse_dim0_dispatch_0() "None" attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [64, 1, 1]>} { | |
%cst64_i32 = spirv.Constant 64 : i32 | |
%cst2_i32 = spirv.Constant 2 : i32 | |
%cst1_i32 = spirv.Constant 1 : i32 | |
%cst0_i32 = spirv.Constant 0 : i32 | |
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32> | |
%__builtin_var_WorkgroupId___addr_0 = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi32> | |
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr_1 = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr_1 : vector<3xi32> | |
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32> | |
%8 = spirv.IMul %1, %cst64_i32 : i32 | |
%9 = spirv.IMul %3, %cst64_i32 : i32 | |
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input> | |
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32> | |
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32> | |
%cst64_i32_2 = spirv.Constant 64 : i32 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%5 : i32) | |
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2 | |
%13 = spirv.SLessThan %12, %cst2_i32 : i32 | |
spirv.BranchConditional %13, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%8 : i32) | |
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2 | |
%16 = spirv.SLessThan %15, %cst1_i32 : i32 | |
spirv.BranchConditional %16, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%11 : i32) | |
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2 | |
%19 = spirv.SLessThan %18, %cst1_i32 : i32 | |
spirv.BranchConditional %19, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
%20 = spirv.IAdd %12, %15 : i32 | |
%cst0_i32_3 = spirv.Constant 0 : i32 | |
%cst0_i32_4 = spirv.Constant 0 : i32 | |
%cst1_i32_5 = spirv.Constant 1 : i32 | |
%21 = spirv.IMul %cst1_i32_5, %20 : i32 | |
%22 = spirv.IAdd %cst0_i32_4, %21 : i32 | |
%23 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32_3, %22] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
%24 = spirv.Load "StorageBuffer" %23 : f32 | |
%25 = spirv.ISub %15, %12 : i32 | |
%26 = spirv.IAdd %25, %cst1_i32 : i32 | |
%cst0_i32_6 = spirv.Constant 0 : i32 | |
%cst0_i32_7 = spirv.Constant 0 : i32 | |
%cst1_i32_8 = spirv.Constant 1 : i32 | |
%27 = spirv.IMul %cst1_i32_8, %26 : i32 | |
%28 = spirv.IAdd %cst0_i32_7, %27 : i32 | |
%29 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32_6, %28] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
spirv.Store "StorageBuffer" %29, %24 : f32 | |
%30 = spirv.IAdd %18, %cst64_i32_2 : i32 | |
spirv.Branch ^bb1(%30 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%17 = spirv.IAdd %15, %9 : i32 | |
spirv.Branch ^bb1(%17 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%14 = spirv.IAdd %12, %7 : i32 | |
spirv.Branch ^bb1(%14 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
spirv.Return | |
} | |
} | |
} | |
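// Note (annotation, not compiler output): iree-convert-to-spirv maps each scf.for onto a spirv.mlir.loop
// region: ^bb1 is the header carrying the induction variable as a block argument and testing it with
// spirv.SLessThan, ^bb2 is the body that branches back to ^bb1 with the incremented value, and ^bb3 is the
// merge block. The HAL bindings become the @__resource_var_0_* globals accessed through spirv.AccessChain,
// and the workgroup/thread IDs become loads of the corresponding built-in Input variables.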
// -----// IR Dump After SPIRVUnifyAliasedResourcePass (spirv-unify-aliased-resource) //----- // | |
spirv.module Logical GLSL450 { | |
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.func @_reverse_dim0_dispatch_0() "None" attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [64, 1, 1]>} { | |
%cst64_i32 = spirv.Constant 64 : i32 | |
%cst2_i32 = spirv.Constant 2 : i32 | |
%cst1_i32 = spirv.Constant 1 : i32 | |
%cst0_i32 = spirv.Constant 0 : i32 | |
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32> | |
%__builtin_var_WorkgroupId___addr_0 = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi32> | |
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr_1 = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr_1 : vector<3xi32> | |
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32> | |
%8 = spirv.IMul %1, %cst64_i32 : i32 | |
%9 = spirv.IMul %3, %cst64_i32 : i32 | |
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input> | |
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32> | |
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32> | |
%cst64_i32_2 = spirv.Constant 64 : i32 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%5 : i32) | |
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2 | |
%13 = spirv.SLessThan %12, %cst2_i32 : i32 | |
spirv.BranchConditional %13, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%8 : i32) | |
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2 | |
%16 = spirv.SLessThan %15, %cst1_i32 : i32 | |
spirv.BranchConditional %16, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%11 : i32) | |
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2 | |
%19 = spirv.SLessThan %18, %cst1_i32 : i32 | |
spirv.BranchConditional %19, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
%20 = spirv.IAdd %12, %15 : i32 | |
%cst0_i32_3 = spirv.Constant 0 : i32 | |
%cst0_i32_4 = spirv.Constant 0 : i32 | |
%cst1_i32_5 = spirv.Constant 1 : i32 | |
%21 = spirv.IMul %cst1_i32_5, %20 : i32 | |
%22 = spirv.IAdd %cst0_i32_4, %21 : i32 | |
%23 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32_3, %22] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
%24 = spirv.Load "StorageBuffer" %23 : f32 | |
%25 = spirv.ISub %15, %12 : i32 | |
%26 = spirv.IAdd %25, %cst1_i32 : i32 | |
%cst0_i32_6 = spirv.Constant 0 : i32 | |
%cst0_i32_7 = spirv.Constant 0 : i32 | |
%cst1_i32_8 = spirv.Constant 1 : i32 | |
%27 = spirv.IMul %cst1_i32_8, %26 : i32 | |
%28 = spirv.IAdd %cst0_i32_7, %27 : i32 | |
%29 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32_6, %28] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
spirv.Store "StorageBuffer" %29, %24 : f32 | |
%30 = spirv.IAdd %18, %cst64_i32_2 : i32 | |
spirv.Branch ^bb1(%30 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%17 = spirv.IAdd %15, %9 : i32 | |
spirv.Branch ^bb1(%17 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%14 = spirv.IAdd %12, %7 : i32 | |
spirv.Branch ^bb1(%14 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
spirv.Return | |
} | |
} | |
// -----// IR Dump After SPIRVLowerABIAttributesPass (spirv-lower-abi-attrs) //----- // | |
spirv.module Logical GLSL450 { | |
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.func @_reverse_dim0_dispatch_0() "None" { | |
%cst64_i32 = spirv.Constant 64 : i32 | |
%cst2_i32 = spirv.Constant 2 : i32 | |
%cst1_i32 = spirv.Constant 1 : i32 | |
%cst0_i32 = spirv.Constant 0 : i32 | |
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32> | |
%__builtin_var_WorkgroupId___addr_0 = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi32> | |
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr_1 = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr_1 : vector<3xi32> | |
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32> | |
%8 = spirv.IMul %1, %cst64_i32 : i32 | |
%9 = spirv.IMul %3, %cst64_i32 : i32 | |
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input> | |
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32> | |
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32> | |
%cst64_i32_2 = spirv.Constant 64 : i32 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%5 : i32) | |
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2 | |
%13 = spirv.SLessThan %12, %cst2_i32 : i32 | |
spirv.BranchConditional %13, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%8 : i32) | |
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2 | |
%16 = spirv.SLessThan %15, %cst1_i32 : i32 | |
spirv.BranchConditional %16, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%11 : i32) | |
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2 | |
%19 = spirv.SLessThan %18, %cst1_i32 : i32 | |
spirv.BranchConditional %19, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
%20 = spirv.IAdd %12, %15 : i32 | |
%cst0_i32_3 = spirv.Constant 0 : i32 | |
%cst0_i32_4 = spirv.Constant 0 : i32 | |
%cst1_i32_5 = spirv.Constant 1 : i32 | |
%21 = spirv.IMul %cst1_i32_5, %20 : i32 | |
%22 = spirv.IAdd %cst0_i32_4, %21 : i32 | |
%23 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32_3, %22] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
%24 = spirv.Load "StorageBuffer" %23 : f32 | |
%25 = spirv.ISub %15, %12 : i32 | |
%26 = spirv.IAdd %25, %cst1_i32 : i32 | |
%cst0_i32_6 = spirv.Constant 0 : i32 | |
%cst0_i32_7 = spirv.Constant 0 : i32 | |
%cst1_i32_8 = spirv.Constant 1 : i32 | |
%27 = spirv.IMul %cst1_i32_8, %26 : i32 | |
%28 = spirv.IAdd %cst0_i32_7, %27 : i32 | |
%29 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32_6, %28] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
spirv.Store "StorageBuffer" %29, %24 : f32 | |
%30 = spirv.IAdd %18, %cst64_i32_2 : i32 | |
spirv.Branch ^bb1(%30 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%17 = spirv.IAdd %15, %9 : i32 | |
spirv.Branch ^bb1(%17 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%14 = spirv.IAdd %12, %7 : i32 | |
spirv.Branch ^bb1(%14 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
spirv.Return | |
} | |
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__ | |
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1 | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
spirv.module Logical GLSL450 { | |
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.func @_reverse_dim0_dispatch_0() "None" { | |
%cst64_i32 = spirv.Constant 64 : i32 | |
%cst2_i32 = spirv.Constant 2 : i32 | |
%cst1_i32 = spirv.Constant 1 : i32 | |
%cst0_i32 = spirv.Constant 0 : i32 | |
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32> | |
%__builtin_var_WorkgroupId___addr_0 = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi32> | |
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr_1 = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr_1 : vector<3xi32> | |
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32> | |
%8 = spirv.IMul %1, %cst64_i32 : i32 | |
%9 = spirv.IMul %3, %cst64_i32 : i32 | |
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input> | |
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32> | |
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32> | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%5 : i32) | |
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2 | |
%13 = spirv.SLessThan %12, %cst2_i32 : i32 | |
spirv.BranchConditional %13, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%8 : i32) | |
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2 | |
%16 = spirv.SLessThan %15, %cst1_i32 : i32 | |
spirv.BranchConditional %16, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%11 : i32) | |
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2 | |
%19 = spirv.SLessThan %18, %cst1_i32 : i32 | |
spirv.BranchConditional %19, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
%20 = spirv.IAdd %12, %15 : i32 | |
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
%22 = spirv.Load "StorageBuffer" %21 : f32 | |
%23 = spirv.ISub %15, %12 : i32 | |
%24 = spirv.IAdd %23, %cst1_i32 : i32 | |
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
spirv.Store "StorageBuffer" %25, %22 : f32 | |
%26 = spirv.IAdd %18, %cst64_i32 : i32 | |
spirv.Branch ^bb1(%26 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%17 = spirv.IAdd %15, %9 : i32 | |
spirv.Branch ^bb1(%17 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%14 = spirv.IAdd %12, %7 : i32 | |
spirv.Branch ^bb1(%14 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
spirv.Return | |
} | |
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__ | |
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1 | |
} | |
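// Compared with the previous dump, the canonicalizer folded the redundant zero/one
// constants (%cst0_i32_3..%cst1_i32_8) and the IMul-by-1 / IAdd-of-0 arithmetic that
// fed each spirv.AccessChain, and dropped the duplicate 64 constant. A minimal C
// sketch of that fold, using illustrative names that are not part of the IR:
#include <stdint.h>

/* Index math as emitted before canonicalization: a zero base offset plus a
   unit stride times the element index. */
static int32_t buffer_index_before(int32_t idx) {
  const int32_t base = 0;    /* the %cst0_i32_* constants */
  const int32_t stride = 1;  /* the %cst1_i32_* constants */
  return base + stride * idx;
}

/* What the fold leaves behind: the canonicalized AccessChain takes the
   element index directly. */
static int32_t buffer_index_after(int32_t idx) {
  return idx;
}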
// -----// IR Dump After CSE (cse) //----- // | |
spirv.module Logical GLSL450 { | |
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.func @_reverse_dim0_dispatch_0() "None" { | |
%cst64_i32 = spirv.Constant 64 : i32 | |
%cst2_i32 = spirv.Constant 2 : i32 | |
%cst1_i32 = spirv.Constant 1 : i32 | |
%cst0_i32 = spirv.Constant 0 : i32 | |
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32> | |
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32> | |
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32> | |
%8 = spirv.IMul %1, %cst64_i32 : i32 | |
%9 = spirv.IMul %3, %cst64_i32 : i32 | |
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input> | |
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32> | |
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32> | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%5 : i32) | |
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2 | |
%13 = spirv.SLessThan %12, %cst2_i32 : i32 | |
spirv.BranchConditional %13, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%8 : i32) | |
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2 | |
%16 = spirv.SLessThan %15, %cst1_i32 : i32 | |
spirv.BranchConditional %16, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%11 : i32) | |
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2 | |
%19 = spirv.SLessThan %18, %cst1_i32 : i32 | |
spirv.BranchConditional %19, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
%20 = spirv.IAdd %12, %15 : i32 | |
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
%22 = spirv.Load "StorageBuffer" %21 : f32 | |
%23 = spirv.ISub %15, %12 : i32 | |
%24 = spirv.IAdd %23, %cst1_i32 : i32 | |
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
spirv.Store "StorageBuffer" %25, %22 : f32 | |
%26 = spirv.IAdd %18, %cst64_i32 : i32 | |
spirv.Branch ^bb1(%26 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%17 = spirv.IAdd %15, %9 : i32 | |
spirv.Branch ^bb1(%17 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%14 = spirv.IAdd %12, %7 : i32 | |
spirv.Branch ^bb1(%14 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
spirv.Return | |
} | |
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__ | |
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1 | |
} | |
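// CSE removed the repeated spirv.mlir.addressof of the WorkgroupId and NumWorkgroups
// builtins (the ..._addr_0 / ..._addr_1 copies are gone); the two loads of each builtin
// remain and now share a single address value. A minimal C analogue of that
// deduplication, with illustrative names:
#include <stdint.h>

/* Before CSE: the address of the same global is rematerialized per load. */
static int32_t wg_xy_before_cse(const int32_t *workgroup_id) {
  const int32_t *a  = workgroup_id;  /* addressof @__builtin_var_WorkgroupId__ */
  int32_t x = a[0];                  /* load component .x */
  const int32_t *a0 = workgroup_id;  /* duplicate addressof, removed by CSE */
  int32_t y = a0[1];                 /* load component .y */
  return x + y;
}

/* After CSE: one addressof, reused by both loads. */
static int32_t wg_xy_after_cse(const int32_t *workgroup_id) {
  const int32_t *a = workgroup_id;
  return a[0] + a[1];
}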
// -----// IR Dump After SPIRVRewriteInsertsPass (spirv-rewrite-inserts) //----- // | |
spirv.module Logical GLSL450 { | |
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.func @_reverse_dim0_dispatch_0() "None" { | |
%cst64_i32 = spirv.Constant 64 : i32 | |
%cst2_i32 = spirv.Constant 2 : i32 | |
%cst1_i32 = spirv.Constant 1 : i32 | |
%cst0_i32 = spirv.Constant 0 : i32 | |
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32> | |
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32> | |
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32> | |
%8 = spirv.IMul %1, %cst64_i32 : i32 | |
%9 = spirv.IMul %3, %cst64_i32 : i32 | |
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input> | |
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32> | |
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32> | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%5 : i32) | |
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2 | |
%13 = spirv.SLessThan %12, %cst2_i32 : i32 | |
spirv.BranchConditional %13, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%8 : i32) | |
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2 | |
%16 = spirv.SLessThan %15, %cst1_i32 : i32 | |
spirv.BranchConditional %16, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%11 : i32) | |
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2 | |
%19 = spirv.SLessThan %18, %cst1_i32 : i32 | |
spirv.BranchConditional %19, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
%20 = spirv.IAdd %12, %15 : i32 | |
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
%22 = spirv.Load "StorageBuffer" %21 : f32 | |
%23 = spirv.ISub %15, %12 : i32 | |
%24 = spirv.IAdd %23, %cst1_i32 : i32 | |
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
spirv.Store "StorageBuffer" %25, %22 : f32 | |
%26 = spirv.IAdd %18, %cst64_i32 : i32 | |
spirv.Branch ^bb1(%26 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%17 = spirv.IAdd %15, %9 : i32 | |
spirv.Branch ^bb1(%17 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%14 = spirv.IAdd %12, %7 : i32 | |
spirv.Branch ^bb1(%14 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
spirv.Return | |
} | |
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__ | |
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1 | |
} | |
// -----// IR Dump After SPIRVCanonicalizeGLPass (spirv-canonicalize-gl) //----- // | |
spirv.module Logical GLSL450 { | |
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.func @_reverse_dim0_dispatch_0() "None" { | |
%cst64_i32 = spirv.Constant 64 : i32 | |
%cst2_i32 = spirv.Constant 2 : i32 | |
%cst1_i32 = spirv.Constant 1 : i32 | |
%cst0_i32 = spirv.Constant 0 : i32 | |
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32> | |
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32> | |
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32> | |
%8 = spirv.IMul %1, %cst64_i32 : i32 | |
%9 = spirv.IMul %3, %cst64_i32 : i32 | |
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input> | |
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32> | |
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32> | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%5 : i32) | |
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2 | |
%13 = spirv.SLessThan %12, %cst2_i32 : i32 | |
spirv.BranchConditional %13, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%8 : i32) | |
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2 | |
%16 = spirv.SLessThan %15, %cst1_i32 : i32 | |
spirv.BranchConditional %16, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%11 : i32) | |
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2 | |
%19 = spirv.SLessThan %18, %cst1_i32 : i32 | |
spirv.BranchConditional %19, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
%20 = spirv.IAdd %12, %15 : i32 | |
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
%22 = spirv.Load "StorageBuffer" %21 : f32 | |
%23 = spirv.ISub %15, %12 : i32 | |
%24 = spirv.IAdd %23, %cst1_i32 : i32 | |
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
spirv.Store "StorageBuffer" %25, %22 : f32 | |
%26 = spirv.IAdd %18, %cst64_i32 : i32 | |
spirv.Branch ^bb1(%26 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%17 = spirv.IAdd %15, %9 : i32 | |
spirv.Branch ^bb1(%17 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%14 = spirv.IAdd %12, %7 : i32 | |
spirv.Branch ^bb1(%14 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
spirv.Return | |
} | |
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__ | |
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1 | |
} | |
// -----// IR Dump After SPIRVUpdateVCEPass (spirv-update-vce) //----- // | |
spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> { | |
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.func @_reverse_dim0_dispatch_0() "None" { | |
%cst64_i32 = spirv.Constant 64 : i32 | |
%cst2_i32 = spirv.Constant 2 : i32 | |
%cst1_i32 = spirv.Constant 1 : i32 | |
%cst0_i32 = spirv.Constant 0 : i32 | |
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32> | |
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32> | |
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32> | |
%8 = spirv.IMul %1, %cst64_i32 : i32 | |
%9 = spirv.IMul %3, %cst64_i32 : i32 | |
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input> | |
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32> | |
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32> | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%5 : i32) | |
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2 | |
%13 = spirv.SLessThan %12, %cst2_i32 : i32 | |
spirv.BranchConditional %13, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%8 : i32) | |
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2 | |
%16 = spirv.SLessThan %15, %cst1_i32 : i32 | |
spirv.BranchConditional %16, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%11 : i32) | |
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2 | |
%19 = spirv.SLessThan %18, %cst1_i32 : i32 | |
spirv.BranchConditional %19, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
%20 = spirv.IAdd %12, %15 : i32 | |
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
%22 = spirv.Load "StorageBuffer" %21 : f32 | |
%23 = spirv.ISub %15, %12 : i32 | |
%24 = spirv.IAdd %23, %cst1_i32 : i32 | |
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
spirv.Store "StorageBuffer" %25, %22 : f32 | |
%26 = spirv.IAdd %18, %cst64_i32 : i32 | |
spirv.Branch ^bb1(%26 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%17 = spirv.IAdd %15, %9 : i32 | |
spirv.Branch ^bb1(%17 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%14 = spirv.IAdd %12, %7 : i32 | |
spirv.Branch ^bb1(%14 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
spirv.Return | |
} | |
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__ | |
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1 | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateTargetExecutableVariantsPass (iree-hal-translate-target-executable-variants) //----- // | |
hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>}> { | |
hal.executable.export public @_reverse_dim0_dispatch_0 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [64 : index, 1 : index, 1 : index]} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg2] | |
hal.return %0, %arg1, %c1 : index, index, index | |
} | |
builtin.module attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>} { | |
spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> { | |
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.func @_reverse_dim0_dispatch_0() "None" { | |
%cst64_i32 = spirv.Constant 64 : i32 | |
%cst2_i32 = spirv.Constant 2 : i32 | |
%cst1_i32 = spirv.Constant 1 : i32 | |
%cst0_i32 = spirv.Constant 0 : i32 | |
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32> | |
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32> | |
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32> | |
%8 = spirv.IMul %1, %cst64_i32 : i32 | |
%9 = spirv.IMul %3, %cst64_i32 : i32 | |
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input> | |
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32> | |
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32> | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%5 : i32) | |
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2 | |
%13 = spirv.SLessThan %12, %cst2_i32 : i32 | |
spirv.BranchConditional %13, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%8 : i32) | |
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2 | |
%16 = spirv.SLessThan %15, %cst1_i32 : i32 | |
spirv.BranchConditional %16, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%11 : i32) | |
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2 | |
%19 = spirv.SLessThan %18, %cst1_i32 : i32 | |
spirv.BranchConditional %19, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
%20 = spirv.IAdd %12, %15 : i32 | |
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
%22 = spirv.Load "StorageBuffer" %21 : f32 | |
%23 = spirv.ISub %15, %12 : i32 | |
%24 = spirv.IAdd %23, %cst1_i32 : i32 | |
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
spirv.Store "StorageBuffer" %25, %22 : f32 | |
%26 = spirv.IAdd %18, %cst64_i32 : i32 | |
spirv.Branch ^bb1(%26 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%17 = spirv.IAdd %15, %9 : i32 | |
spirv.Branch ^bb1(%17 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%14 = spirv.IAdd %12, %7 : i32 | |
spirv.Branch ^bb1(%14 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
spirv.Return | |
} | |
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__ | |
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1 | |
} | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass (iree-hal-translate-executables) //----- // | |
hal.executable public @_reverse_dim0_dispatch_0 { | |
hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>}> { | |
hal.executable.export public @_reverse_dim0_dispatch_0 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [64 : index, 1 : index, 1 : index]} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg2] | |
hal.return %0, %arg1, %c1 : index, index, index | |
} | |
builtin.module attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>} { | |
spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> { | |
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.func @_reverse_dim0_dispatch_0() "None" { | |
%cst64_i32 = spirv.Constant 64 : i32 | |
%cst2_i32 = spirv.Constant 2 : i32 | |
%cst1_i32 = spirv.Constant 1 : i32 | |
%cst0_i32 = spirv.Constant 0 : i32 | |
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32> | |
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32> | |
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32> | |
%8 = spirv.IMul %1, %cst64_i32 : i32 | |
%9 = spirv.IMul %3, %cst64_i32 : i32 | |
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input> | |
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32> | |
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32> | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%5 : i32) | |
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2 | |
%13 = spirv.SLessThan %12, %cst2_i32 : i32 | |
spirv.BranchConditional %13, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%8 : i32) | |
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2 | |
%16 = spirv.SLessThan %15, %cst1_i32 : i32 | |
spirv.BranchConditional %16, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%11 : i32) | |
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2 | |
%19 = spirv.SLessThan %18, %cst1_i32 : i32 | |
spirv.BranchConditional %19, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
%20 = spirv.IAdd %12, %15 : i32 | |
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
%22 = spirv.Load "StorageBuffer" %21 : f32 | |
%23 = spirv.ISub %15, %12 : i32 | |
%24 = spirv.IAdd %23, %cst1_i32 : i32 | |
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
spirv.Store "StorageBuffer" %25, %22 : f32 | |
%26 = spirv.IAdd %18, %cst64_i32 : i32 | |
spirv.Branch ^bb1(%26 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%17 = spirv.IAdd %15, %9 : i32 | |
spirv.Branch ^bb1(%17 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%14 = spirv.IAdd %12, %7 : i32 | |
spirv.Branch ^bb1(%14 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
spirv.Return | |
} | |
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__ | |
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1 | |
} | |
} | |
} | |
} | |
#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]> | |
#translation = #iree_codegen.translation_info<SPIRVBaseDistribute> | |
module { | |
hal.executable public @_reverse_dim0_dispatch_0 { | |
hal.executable.variant public @vulkan_spirv_fb, target = #executable_target_vulkan_spirv_fb { | |
hal.executable.export public @_reverse_dim0_dispatch_0 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation, workgroup_size = [64 : index, 1 : index, 1 : index]} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply #map()[%arg2] | |
hal.return %0, %arg1, %c1 : index, index, index | |
} | |
builtin.module attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>} { | |
spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> { | |
spirv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi32>, Input> | |
spirv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
spirv.func @_reverse_dim0_dispatch_0() "None" { | |
%cst64_i32 = spirv.Constant 64 : i32 | |
%cst2_i32 = spirv.Constant 2 : i32 | |
%cst1_i32 = spirv.Constant 1 : i32 | |
%cst0_i32 = spirv.Constant 0 : i32 | |
%__resource_var_0_0__addr = spirv.mlir.addressof @__resource_var_0_0_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__resource_var_0_1__addr = spirv.mlir.addressof @__resource_var_0_1_ : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer> | |
%__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi32>, Input> | |
%0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi32> | |
%__builtin_var_NumWorkgroups___addr = spirv.mlir.addressof @__builtin_var_NumWorkgroups__ : !spirv.ptr<vector<3xi32>, Input> | |
%2 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%3 = spirv.CompositeExtract %2[0 : i32] : vector<3xi32> | |
%4 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32> | |
%5 = spirv.CompositeExtract %4[1 : i32] : vector<3xi32> | |
%6 = spirv.Load "Input" %__builtin_var_NumWorkgroups___addr : vector<3xi32> | |
%7 = spirv.CompositeExtract %6[1 : i32] : vector<3xi32> | |
%8 = spirv.IMul %1, %cst64_i32 : i32 | |
%9 = spirv.IMul %3, %cst64_i32 : i32 | |
%__builtin_var_LocalInvocationId___addr = spirv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spirv.ptr<vector<3xi32>, Input> | |
%10 = spirv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32> | |
%11 = spirv.CompositeExtract %10[0 : i32] : vector<3xi32> | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%5 : i32) | |
^bb1(%12: i32): // 2 preds: ^bb0, ^bb2 | |
%13 = spirv.SLessThan %12, %cst2_i32 : i32 | |
spirv.BranchConditional %13, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%8 : i32) | |
^bb1(%15: i32): // 2 preds: ^bb0, ^bb2 | |
%16 = spirv.SLessThan %15, %cst1_i32 : i32 | |
spirv.BranchConditional %16, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
spirv.mlir.loop { | |
spirv.Branch ^bb1(%11 : i32) | |
^bb1(%18: i32): // 2 preds: ^bb0, ^bb2 | |
%19 = spirv.SLessThan %18, %cst1_i32 : i32 | |
spirv.BranchConditional %19, ^bb2, ^bb3 | |
^bb2: // pred: ^bb1 | |
%20 = spirv.IAdd %12, %15 : i32 | |
%21 = spirv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %20] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
%22 = spirv.Load "StorageBuffer" %21 : f32 | |
%23 = spirv.ISub %15, %12 : i32 | |
%24 = spirv.IAdd %23, %cst1_i32 : i32 | |
%25 = spirv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %24] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 | |
spirv.Store "StorageBuffer" %25, %22 : f32 | |
%26 = spirv.IAdd %18, %cst64_i32 : i32 | |
spirv.Branch ^bb1(%26 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%17 = spirv.IAdd %15, %9 : i32 | |
spirv.Branch ^bb1(%17 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
%14 = spirv.IAdd %12, %7 : i32 | |
spirv.Branch ^bb1(%14 : i32) | |
^bb3: // pred: ^bb1 | |
spirv.mlir.merge | |
} | |
spirv.Return | |
} | |
spirv.EntryPoint "GLCompute" @_reverse_dim0_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__ | |
spirv.ExecutionMode @_reverse_dim0_dispatch_0 "LocalSize", 64, 1, 1 | |
} | |
} | |
} | |
} | |
} | |
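// The spirv.func in the final dump above is what actually runs on the device. A minimal
// C sketch of what a single invocation appears to compute, using illustrative names
// (wg_id, num_wg, local_id, src, dst) that are not part of the IR; the bounds 2 and 1
// come from the tensor<2x1xf32> shape and the 64s from the workgroup size:
#include <stdint.h>

typedef struct { int32_t x, y, z; } ivec3_t;

/* Grid-strided loops mirroring the three nested spirv.mlir.loop regions. */
static void reverse_dim0_invocation(ivec3_t wg_id, ivec3_t num_wg,
                                    ivec3_t local_id,
                                    const float *src, float *dst) {
  for (int32_t i = wg_id.y; i < 2; i += num_wg.y) {              /* rows of the 2x1 tensor */
    for (int32_t j = wg_id.x * 64; j < 1; j += num_wg.x * 64) {  /* columns, tiled by 64 */
      for (int32_t k = local_id.x; k < 1; k += 64) {             /* lanes within a workgroup */
        /* Load src[i + j] and store it at the mirrored index (j - i) + 1. */
        dst[(j - i) + 1] = src[i + j];
      }
    }
  }
}

// Since the j and k loops only admit the value 0, this collapses to dst[1 - i] = src[i]
// for i in {0, 1}: the two rows are swapped, which matches the reverse along dimension 0
// that the _reverse_dim0_dispatch_0 name and the original iree_linalg_ext.reverse imply.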