@pashu123
Created January 13, 2025 08:04
This file has been truncated; only part of the full pass-by-pass IR dump is reproduced below.
// -----// IR Dump After ConvolutionToIGEMMPass (iree-codegen-convolution-to-igemm) //----- //
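// The 3x3, stride-2 NHWC convolution (input 2x35x35x1281) is rewritten as an implicit GEMM:
// iree_linalg_ext.im2col gathers input patches into a 2x17x17x11529 tensor
// (OH = OW = floor((35 - 3) / 2) + 1 = 17, K = 3 * 3 * 1281 = 11529), which a linalg.generic
// then contracts against the 11529x1281 filter with f32 accumulation.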
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x17x17x1281xf32>) -> tensor<2x17x17x1281xf32>
%6 = tensor.empty() : tensor<2x17x17x11529xf16>
%7 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [0, 0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%6 : tensor<2x17x17x11529xf16>) -> tensor<2x17x17x11529xf16>
%8 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%7, %8 : tensor<2x17x17x11529xf16>, tensor<11529x1281xf16>) outs(%5 : tensor<2x17x17x1281xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} {
^bb0(%in: f16, %in_0: f16, %out: f32):
%10 = arith.extf %in : f16 to f32
%11 = arith.extf %in_0 : f16 to f32
%12 = arith.mulf %10, %11 : f32
%13 = arith.addf %12, %out : f32
linalg.yield %13 : f32
} -> tensor<2x17x17x1281xf32>
flow.dispatch.tensor.store %9, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After ConvertAccGEMMToGEMMPass (iree-convert-accgemm-to-gemm) //----- //
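// No visible change in this dump: the GEMM already accumulates into a zero-filled
// tensor.empty rather than a loaded accumulator, so there appears to be nothing for this
// pass to rewrite in this dispatch.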
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x17x17x1281xf32>) -> tensor<2x17x17x1281xf32>
%6 = tensor.empty() : tensor<2x17x17x11529xf16>
%7 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [0, 0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%6 : tensor<2x17x17x11529xf16>) -> tensor<2x17x17x11529xf16>
%8 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%7, %8 : tensor<2x17x17x11529xf16>, tensor<11529x1281xf16>) outs(%5 : tensor<2x17x17x1281xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} {
^bb0(%in: f16, %in_0: f16, %out: f32):
%10 = arith.extf %in : f16 to f32
%11 = arith.extf %in_0 : f16 to f32
%12 = arith.mulf %10, %11 : f32
%13 = arith.addf %12, %out : f32
linalg.yield %13 : f32
} -> tensor<2x17x17x1281xf32>
flow.dispatch.tensor.store %9, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- //
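// The GEMM is distributed to workgroups with an scf.forall over (OH, OC) = (17, 1281) in
// steps of (1, 16), following the workgroup tile sizes [2, 1, 32, 16, 0]; batch (2) and
// OW (17 <= 32) are not split across workgroups. Because 1281 = 80 * 16 + 1, the
// affine.min yields a tail tile of width 1 on the last OC slice.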
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c17 = arith.constant 17 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%extracted_slice_0 = tensor.extract_slice %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x17x17x1281xf32> to tensor<2x1x17x?xf32>
%10 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%9, %extracted_slice : tensor<2x1x17x11529xf16>, tensor<11529x?xf16>) outs(%10 : tensor<2x1x17x?xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} {
^bb0(%in: f16, %in_1: f16, %out: f32):
%12 = arith.extf %in : f16 to f32
%13 = arith.extf %in_1 : f16 to f32
%14 = arith.mulf %12, %13 : f32
%15 = arith.addf %14, %out : f32
linalg.yield %15 : f32
} -> tensor<2x1x17x?xf32>
%cast = tensor.cast %11 : tensor<2x1x17x?xf32> to tensor<2x1x?x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %cast into %arg2[%c0, %arg0, %c0, %arg1] [2, 1, %c17, %7] [1, 1, 1, 1] : tensor<2x1x?x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
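// Canonicalization drops the %c17 constant and the tensor.cast: the
// tensor.parallel_insert_slice now uses static offsets and the static size 17 directly.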
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%extracted_slice_0 = tensor.extract_slice %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x17x17x1281xf32> to tensor<2x1x17x?xf32>
%10 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%9, %extracted_slice : tensor<2x1x17x11529xf16>, tensor<11529x?xf16>) outs(%10 : tensor<2x1x17x?xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} {
^bb0(%in: f16, %in_1: f16, %out: f32):
%12 = arith.extf %in : f16 to f32
%13 = arith.extf %in_1 : f16 to f32
%14 = arith.mulf %12, %13 : f32
%15 = arith.addf %14, %out : f32
linalg.yield %15 : f32
} -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %11 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
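// CSE leaves this dump identical to the previous one.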
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%extracted_slice_0 = tensor.extract_slice %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x17x17x1281xf32> to tensor<2x1x17x?xf32>
%10 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%9, %extracted_slice : tensor<2x1x17x11529xf16>, tensor<11529x?xf16>) outs(%10 : tensor<2x1x17x?xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} {
^bb0(%in: f16, %in_1: f16, %out: f32):
%12 = arith.extf %in : f16 to f32
%13 = arith.extf %in_1 : f16 to f32
%14 = arith.mulf %12, %13 : f32
%15 = arith.addf %14, %out : f32
linalg.yield %15 : f32
} -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %11 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After GPUPadOperandsPass (iree-codegen-gpu-pad-operands) //----- //
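// Operands are padded to the shapes requested by padding = [2, 1, 32, 16, 16]: the im2col
// result grows from 2x1x17x11529 to 2x1x32x11536 (17 -> 32 on OW, 11529 -> 11536 = 721 * 16
// on K), the filter slice to 11536x16, and the f32 accumulator slice to 2x1x32x16, all with
// zero padding values.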
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%extracted_slice_0 = tensor.extract_slice %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x17x17x1281xf32> to tensor<2x1x17x?xf32>
%10 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
%cst_1 = arith.constant 0.000000e+00 : f16
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst_1 : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%cst_2 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%11 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_3 = tensor.pad %extracted_slice low[0, 0] high[7, %11] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst_2 : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%cst_4 = arith.constant 0.000000e+00 : f32
%c3 = arith.constant 3 : index
%dim = tensor.dim %10, %c3 : tensor<2x1x17x?xf32>
%12 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%dim)
%padded_5 = tensor.pad %10 low[0, 0, 0, 0] high[0, 0, 15, %12] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst_4 : f32
} : tensor<2x1x17x?xf32> to tensor<2x1x32x16xf32>
%c1_6 = arith.constant 1 : index
%c3_7 = arith.constant 3 : index
%dim_8 = tensor.dim %10, %c3_7 : tensor<2x1x17x?xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%padded, %padded_3 : tensor<2x1x32x11536xf16>, tensor<11536x16xf16>) outs(%padded_5 : tensor<2x1x32x16xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} {
^bb0(%in: f16, %in_10: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
%15 = arith.extf %in_10 : f16 to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %16, %out : f32
linalg.yield %17 : f32
} -> tensor<2x1x32x16xf32>
%extracted_slice_9 = tensor.extract_slice %13[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %extracted_slice_9 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After GPUPromoteMatmulOperandsPass (iree-codegen-gpu-promote-matmul-operands) //----- //
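// Per promote_operands = [0, 1, 2], both padded GEMM inputs are routed through linalg.copy
// ops tagged #iree_gpu.derived_thread_config, and the result goes through a
// bufferization.alloc_tensor in #gpu.address_space<workgroup>; these copies presumably
// become the shared-memory staging buffers after bufferization.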
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%extracted_slice_0 = tensor.extract_slice %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x17x17x1281xf32> to tensor<2x1x17x?xf32>
%10 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
%cst_1 = arith.constant 0.000000e+00 : f16
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst_1 : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%cst_2 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%11 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_3 = tensor.pad %extracted_slice low[0, 0] high[7, %11] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst_2 : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%cst_4 = arith.constant 0.000000e+00 : f32
%c3 = arith.constant 3 : index
%dim = tensor.dim %10, %c3 : tensor<2x1x17x?xf32>
%12 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%dim)
%padded_5 = tensor.pad %10 low[0, 0, 0, 0] high[0, 0, 15, %12] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst_4 : f32
} : tensor<2x1x17x?xf32> to tensor<2x1x32x16xf32>
%c1_6 = arith.constant 1 : index
%c3_7 = arith.constant 3 : index
%dim_8 = tensor.dim %10, %c3_7 : tensor<2x1x17x?xf32>
%13 = tensor.empty() : tensor<2x1x32x11536xf16>
%14 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%padded : tensor<2x1x32x11536xf16>) outs(%13 : tensor<2x1x32x11536xf16>) -> tensor<2x1x32x11536xf16>
%15 = tensor.empty() : tensor<11536x16xf16>
%16 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%padded_3 : tensor<11536x16xf16>) outs(%15 : tensor<11536x16xf16>) -> tensor<11536x16xf16>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%14, %16 : tensor<2x1x32x11536xf16>, tensor<11536x16xf16>) outs(%padded_5 : tensor<2x1x32x16xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} {
^bb0(%in: f16, %in_11: f16, %out: f32):
%22 = arith.extf %in : f16 to f32
%23 = arith.extf %in_11 : f16 to f32
%24 = arith.mulf %22, %23 : f32
%25 = arith.addf %24, %out : f32
linalg.yield %25 : f32
} -> tensor<2x1x32x16xf32>
%18 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%19 = linalg.copy ins(%17 : tensor<2x1x32x16xf32>) outs(%18 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%extracted_slice_9 = tensor.extract_slice %19[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%c3_10 = arith.constant 3 : index
%20 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_9 : tensor<2x1x17x?xf32>) outs(%20 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %21 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After GPUPackToIntrinsicsPass (iree-codegen-gpu-pack-to-intrinsics) //----- //
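// The GEMM is packed to the MFMA_F32_16x16x16_F16 intrinsic shape using 16x16 tiles:
// a 2x1x2x721x16x16 LHS (32 / 16 = 2, 11536 / 16 = 721), a 721x1x16x16 RHS, and a
// 2x1x2x1x16x16 f32 accumulator feed iree_gpu.multi_mma with rhs_permutation = [1, 0].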
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%extracted_slice_1 = tensor.extract_slice %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x17x17x1281xf32> to tensor<2x1x17x?xf32>
%10 = linalg.fill ins(%cst_0 : f32) outs(%extracted_slice_1 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%11 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_2 = tensor.pad %extracted_slice low[0, 0] high[7, %11] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%dim = tensor.dim %10, %c3 : tensor<2x1x17x?xf32>
%12 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%dim)
%padded_3 = tensor.pad %10 low[0, 0, 0, 0] high[0, 0, 15, %12] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst_0 : f32
} : tensor<2x1x17x?xf32> to tensor<2x1x32x16xf32>
%13 = tensor.empty() : tensor<2x1x32x11536xf16>
%14 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%padded : tensor<2x1x32x11536xf16>) outs(%13 : tensor<2x1x32x11536xf16>) -> tensor<2x1x32x11536xf16>
%15 = tensor.empty() : tensor<11536x16xf16>
%16 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%padded_2 : tensor<11536x16xf16>) outs(%15 : tensor<11536x16xf16>) -> tensor<11536x16xf16>
%17 = tensor.empty() : tensor<2x1x2x721x16x16xf16>
%pack = tensor.pack %14 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %17 : tensor<2x1x32x11536xf16> -> tensor<2x1x2x721x16x16xf16>
%18 = tensor.empty() : tensor<721x1x16x16xf16>
%pack_4 = tensor.pack %16 inner_dims_pos = [1, 0] inner_tiles = [16, 16] into %18 : tensor<11536x16xf16> -> tensor<721x1x16x16xf16>
%19 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%pack_5 = tensor.pack %padded_3 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %19 : tensor<2x1x32x16xf32> -> tensor<2x1x2x1x16x16xf32>
%20 = iree_gpu.multi_mma %pack, %pack_4, %pack_5 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x721x16x16xf16>, tensor<721x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
%unpack = tensor.unpack %20 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %padded_3 : tensor<2x1x2x1x16x16xf32> -> tensor<2x1x32x16xf32>
%21 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%22 = linalg.copy ins(%unpack : tensor<2x1x32x16xf32>) outs(%21 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%extracted_slice_6 = tensor.extract_slice %22[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%23 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%24 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_6 : tensor<2x1x17x?xf32>) outs(%23 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %24 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After DecomposeBoundaryPackUnPackOpsPass (iree-codegen-decompose-boundary-pack-unpack-ops) //----- //
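// The accumulator is no longer sliced out of the destination tensor: the fill/pad/pack
// chain on the output slice is replaced by zero fills created directly at the 2x1x32x16
// and packed 2x1x2x1x16x16 shapes, which appears to come from folding the pack of a
// padded zero fill.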
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%10 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_1 = tensor.pad %extracted_slice low[0, 0] high[7, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%11 = tensor.empty() : tensor<2x1x32x16xf32>
%12 = linalg.fill ins(%cst_0 : f32) outs(%11 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%13 = tensor.empty() : tensor<2x1x32x11536xf16>
%14 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%padded : tensor<2x1x32x11536xf16>) outs(%13 : tensor<2x1x32x11536xf16>) -> tensor<2x1x32x11536xf16>
%15 = tensor.empty() : tensor<11536x16xf16>
%16 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%padded_1 : tensor<11536x16xf16>) outs(%15 : tensor<11536x16xf16>) -> tensor<11536x16xf16>
%17 = tensor.empty() : tensor<2x1x2x721x16x16xf16>
%pack = tensor.pack %14 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %17 : tensor<2x1x32x11536xf16> -> tensor<2x1x2x721x16x16xf16>
%18 = tensor.empty() : tensor<721x1x16x16xf16>
%pack_2 = tensor.pack %16 inner_dims_pos = [1, 0] inner_tiles = [16, 16] into %18 : tensor<11536x16xf16> -> tensor<721x1x16x16xf16>
%19 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%20 = linalg.fill ins(%cst_0 : f32) outs(%19 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%21 = iree_gpu.multi_mma %pack, %pack_2, %20 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x721x16x16xf16>, tensor<721x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
%unpack = tensor.unpack %21 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %12 : tensor<2x1x2x1x16x16xf32> -> tensor<2x1x32x16xf32>
%22 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%23 = linalg.copy ins(%unpack : tensor<2x1x32x16xf32>) outs(%22 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%extracted_slice_3 = tensor.extract_slice %23[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%24 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%25 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<2x1x17x?xf32>) outs(%24 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %25 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After ConcretizeMmaShapesPass (iree-gpu-concretize-mma-shapes) //----- //
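// No visible change in this dump relative to the previous one.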
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%10 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_1 = tensor.pad %extracted_slice low[0, 0] high[7, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%11 = tensor.empty() : tensor<2x1x32x16xf32>
%12 = linalg.fill ins(%cst_0 : f32) outs(%11 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%13 = tensor.empty() : tensor<2x1x32x11536xf16>
%14 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%padded : tensor<2x1x32x11536xf16>) outs(%13 : tensor<2x1x32x11536xf16>) -> tensor<2x1x32x11536xf16>
%15 = tensor.empty() : tensor<11536x16xf16>
%16 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%padded_1 : tensor<11536x16xf16>) outs(%15 : tensor<11536x16xf16>) -> tensor<11536x16xf16>
%17 = tensor.empty() : tensor<2x1x2x721x16x16xf16>
%pack = tensor.pack %14 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %17 : tensor<2x1x32x11536xf16> -> tensor<2x1x2x721x16x16xf16>
%18 = tensor.empty() : tensor<721x1x16x16xf16>
%pack_2 = tensor.pack %16 inner_dims_pos = [1, 0] inner_tiles = [16, 16] into %18 : tensor<11536x16xf16> -> tensor<721x1x16x16xf16>
%19 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%20 = linalg.fill ins(%cst_0 : f32) outs(%19 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%21 = iree_gpu.multi_mma %pack, %pack_2, %20 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x721x16x16xf16>, tensor<721x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
%unpack = tensor.unpack %21 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %12 : tensor<2x1x2x1x16x16xf32> -> tensor<2x1x32x16xf32>
%22 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%23 = linalg.copy ins(%unpack : tensor<2x1x32x16xf32>) outs(%22 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%extracted_slice_3 = tensor.extract_slice %23[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%24 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%25 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<2x1x17x?xf32>) outs(%24 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %25 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After PropagateReshapesByExpansionPass (iree-codegen-propagate-reshapes-by-expansion) //----- //
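// No visible change in this dump either.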
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%10 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_1 = tensor.pad %extracted_slice low[0, 0] high[7, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%11 = tensor.empty() : tensor<2x1x32x16xf32>
%12 = linalg.fill ins(%cst_0 : f32) outs(%11 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%13 = tensor.empty() : tensor<2x1x32x11536xf16>
%14 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%padded : tensor<2x1x32x11536xf16>) outs(%13 : tensor<2x1x32x11536xf16>) -> tensor<2x1x32x11536xf16>
%15 = tensor.empty() : tensor<11536x16xf16>
%16 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%padded_1 : tensor<11536x16xf16>) outs(%15 : tensor<11536x16xf16>) -> tensor<11536x16xf16>
%17 = tensor.empty() : tensor<2x1x2x721x16x16xf16>
%pack = tensor.pack %14 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %17 : tensor<2x1x32x11536xf16> -> tensor<2x1x2x721x16x16xf16>
%18 = tensor.empty() : tensor<721x1x16x16xf16>
%pack_2 = tensor.pack %16 inner_dims_pos = [1, 0] inner_tiles = [16, 16] into %18 : tensor<11536x16xf16> -> tensor<721x1x16x16xf16>
%19 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%20 = linalg.fill ins(%cst_0 : f32) outs(%19 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%21 = iree_gpu.multi_mma %pack, %pack_2, %20 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x721x16x16xf16>, tensor<721x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
%unpack = tensor.unpack %21 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %12 : tensor<2x1x2x1x16x16xf32> -> tensor<2x1x32x16xf32>
%22 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%23 = linalg.copy ins(%unpack : tensor<2x1x32x16xf32>) outs(%22 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%extracted_slice_3 = tensor.extract_slice %23[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%24 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%25 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<2x1x17x?xf32>) outs(%24 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %25 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
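// End of the workgroup-tiled dump: each (y, x) workgroup produces a 2x1x17x(<=16) output tile.
// The im2col result is padded to 2x1x32x11536 and the weight slice to 11536x16, both are packed
// into the MFMA_F32_16x16x16_F16 layout (2x1x2x721x16x16 and 721x1x16x16), a single multi_mma
// performs the full reduction, and the result is unpacked, staged through a workgroup-memory
// tensor, and the valid 2x1x17x%7 region is written back into the 2x17x17x1281 output.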
// -----// IR Dump After GPUApplyTilingLevelPass (iree-codegen-gpu-apply-tiling-level) //----- //
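// Compared with the dump above, the reduction (K) dimension is now tiled: the single multi_mma
// over the padded K = 11536 (721 * 16) becomes an scf.for from 0 to 721 that, per iteration,
// slices a 2x1x32x16 LHS tile and a 16x16 RHS tile out of the padded operands, copies them
// (derived_thread_config), packs them to the per-step MMA shapes 2x1x2x1x16x16 / 1x1x16x16, and
// accumulates into the 2x1x2x1x16x16xf32 iter_args value.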
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%10 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_1 = tensor.pad %extracted_slice low[0, 0] high[7, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%11 = tensor.empty() : tensor<2x1x32x16xf32>
%12 = linalg.fill ins(%cst_0 : f32) outs(%11 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%13 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%14 = linalg.fill ins(%cst_0 : f32) outs(%13 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%15 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %14) -> (tensor<2x1x2x1x16x16xf32>) {
%20 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, 0, %20] [2, 1, 32, 16] [1, 1, 1, 1] : tensor<2x1x32x11536xf16> to tensor<2x1x32x16xf16>
%21 = tensor.empty() : tensor<2x1x32x16xf16>
%22 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<2x1x32x16xf16>) outs(%21 : tensor<2x1x32x16xf16>) -> tensor<2x1x32x16xf16>
%23 = tensor.empty() : tensor<2x1x2x1x16x16xf16>
%pack = tensor.pack %22 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %23 : tensor<2x1x32x16xf16> -> tensor<2x1x2x1x16x16xf16>
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_4 = tensor.extract_slice %padded_1[%24, 0] [16, 16] [1, 1] : tensor<11536x16xf16> to tensor<16x16xf16>
%25 = tensor.empty() : tensor<16x16xf16>
%26 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_4 : tensor<16x16xf16>) outs(%25 : tensor<16x16xf16>) -> tensor<16x16xf16>
%27 = tensor.empty() : tensor<1x1x16x16xf16>
%pack_5 = tensor.pack %26 inner_dims_pos = [1, 0] inner_tiles = [16, 16] into %27 : tensor<16x16xf16> -> tensor<1x1x16x16xf16>
%28 = iree_gpu.multi_mma %pack, %pack_5, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
scf.yield %28 : tensor<2x1x2x1x16x16xf32>
}
%unpack = tensor.unpack %15 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %12 : tensor<2x1x2x1x16x16xf32> -> tensor<2x1x32x16xf32>
%16 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%17 = linalg.copy ins(%unpack : tensor<2x1x32x16xf32>) outs(%16 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%extracted_slice_2 = tensor.extract_slice %17[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%18 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%19 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<2x1x17x?xf32>) outs(%18 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
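// Canonicalization after reduction tiling: the linalg.fill of the 2x1x32x16xf32 unpack
// destination is folded away (tensor.unpack fully overwrites it, so a bare tensor.empty
// suffices); only the fill of the packed 2x1x2x1x16x16xf32 accumulator remains.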
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%10 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_1 = tensor.pad %extracted_slice low[0, 0] high[7, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%11 = tensor.empty() : tensor<2x1x32x16xf32>
%12 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%13 = linalg.fill ins(%cst_0 : f32) outs(%12 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%14 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %13) -> (tensor<2x1x2x1x16x16xf32>) {
%19 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, 0, %19] [2, 1, 32, 16] [1, 1, 1, 1] : tensor<2x1x32x11536xf16> to tensor<2x1x32x16xf16>
%20 = tensor.empty() : tensor<2x1x32x16xf16>
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<2x1x32x16xf16>) outs(%20 : tensor<2x1x32x16xf16>) -> tensor<2x1x32x16xf16>
%22 = tensor.empty() : tensor<2x1x2x1x16x16xf16>
%pack = tensor.pack %21 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %22 : tensor<2x1x32x16xf16> -> tensor<2x1x2x1x16x16xf16>
%23 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_4 = tensor.extract_slice %padded_1[%23, 0] [16, 16] [1, 1] : tensor<11536x16xf16> to tensor<16x16xf16>
%24 = tensor.empty() : tensor<16x16xf16>
%25 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_4 : tensor<16x16xf16>) outs(%24 : tensor<16x16xf16>) -> tensor<16x16xf16>
%26 = tensor.empty() : tensor<1x1x16x16xf16>
%pack_5 = tensor.pack %25 inner_dims_pos = [1, 0] inner_tiles = [16, 16] into %26 : tensor<16x16xf16> -> tensor<1x1x16x16xf16>
%27 = iree_gpu.multi_mma %pack, %pack_5, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
scf.yield %27 : tensor<2x1x2x1x16x16xf32>
}
%unpack = tensor.unpack %14 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %11 : tensor<2x1x2x1x16x16xf32> -> tensor<2x1x32x16xf32>
%15 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%16 = linalg.copy ins(%unpack : tensor<2x1x32x16xf32>) outs(%15 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%extracted_slice_2 = tensor.extract_slice %16[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%17 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%18 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<2x1x17x?xf32>) outs(%17 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
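// CSE merges the two identical affine.apply <(d0) -> (d0 * 16)> computations of the K offset
// inside the scf.for, so both operand slices now index with the same value (%19).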
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%10 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_1 = tensor.pad %extracted_slice low[0, 0] high[7, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%11 = tensor.empty() : tensor<2x1x32x16xf32>
%12 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%13 = linalg.fill ins(%cst_0 : f32) outs(%12 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%14 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %13) -> (tensor<2x1x2x1x16x16xf32>) {
%19 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, 0, %19] [2, 1, 32, 16] [1, 1, 1, 1] : tensor<2x1x32x11536xf16> to tensor<2x1x32x16xf16>
%20 = tensor.empty() : tensor<2x1x32x16xf16>
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<2x1x32x16xf16>) outs(%20 : tensor<2x1x32x16xf16>) -> tensor<2x1x32x16xf16>
%22 = tensor.empty() : tensor<2x1x2x1x16x16xf16>
%pack = tensor.pack %21 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %22 : tensor<2x1x32x16xf16> -> tensor<2x1x2x1x16x16xf16>
%extracted_slice_4 = tensor.extract_slice %padded_1[%19, 0] [16, 16] [1, 1] : tensor<11536x16xf16> to tensor<16x16xf16>
%23 = tensor.empty() : tensor<16x16xf16>
%24 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_4 : tensor<16x16xf16>) outs(%23 : tensor<16x16xf16>) -> tensor<16x16xf16>
%25 = tensor.empty() : tensor<1x1x16x16xf16>
%pack_5 = tensor.pack %24 inner_dims_pos = [1, 0] inner_tiles = [16, 16] into %25 : tensor<16x16xf16> -> tensor<1x1x16x16xf16>
%26 = iree_gpu.multi_mma %pack, %pack_5, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
scf.yield %26 : tensor<2x1x2x1x16x16xf32>
}
%unpack = tensor.unpack %14 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %11 : tensor<2x1x2x1x16x16xf32> -> tensor<2x1x32x16xf32>
%15 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%16 = linalg.copy ins(%unpack : tensor<2x1x32x16xf32>) outs(%15 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%extracted_slice_2 = tensor.extract_slice %16[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%17 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%18 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<2x1x17x?xf32>) outs(%17 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After DecomposePackUnPackOpsPass (iree-codegen-decompose-pack-unpack-ops) //----- //
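// tensor.pack/unpack are decomposed into reshapes and transposes: each pack becomes
// tensor.expand_shape + linalg.transpose (2x1x32x16 -> 2x1x2x16x1x16 -> 2x1x2x1x16x16 for the
// LHS, 16x16 -> 1x16x1x16 -> 1x1x16x16 for the RHS), and the unpack of the accumulator becomes
// linalg.transpose + tensor.collapse_shape + linalg.copy into the 2x1x32x16xf32 buffer.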
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%10 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_1 = tensor.pad %extracted_slice low[0, 0] high[7, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%11 = tensor.empty() : tensor<2x1x32x16xf32>
%12 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%13 = linalg.fill ins(%cst_0 : f32) outs(%12 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%14 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %13) -> (tensor<2x1x2x1x16x16xf32>) {
%21 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, 0, %21] [2, 1, 32, 16] [1, 1, 1, 1] : tensor<2x1x32x11536xf16> to tensor<2x1x32x16xf16>
%22 = tensor.empty() : tensor<2x1x32x16xf16>
%23 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<2x1x32x16xf16>) outs(%22 : tensor<2x1x32x16xf16>) -> tensor<2x1x32x16xf16>
%24 = tensor.empty() : tensor<2x1x2x1x16x16xf16>
%expanded = tensor.expand_shape %23 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%transposed_4 = linalg.transpose ins(%expanded : tensor<2x1x2x16x1x16xf16>) outs(%24 : tensor<2x1x2x1x16x16xf16>) permutation = [0, 1, 2, 4, 3, 5]
%extracted_slice_5 = tensor.extract_slice %padded_1[%21, 0] [16, 16] [1, 1] : tensor<11536x16xf16> to tensor<16x16xf16>
%25 = tensor.empty() : tensor<16x16xf16>
%26 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_5 : tensor<16x16xf16>) outs(%25 : tensor<16x16xf16>) -> tensor<16x16xf16>
%27 = tensor.empty() : tensor<1x1x16x16xf16>
%expanded_6 = tensor.expand_shape %26 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%transposed_7 = linalg.transpose ins(%expanded_6 : tensor<1x16x1x16xf16>) outs(%27 : tensor<1x1x16x16xf16>) permutation = [0, 2, 3, 1]
%28 = iree_gpu.multi_mma %transposed_4, %transposed_7, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
scf.yield %28 : tensor<2x1x2x1x16x16xf32>
}
%15 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%14 : tensor<2x1x2x1x16x16xf32>) outs(%15 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%16 = linalg.copy ins(%collapsed : tensor<2x1x32x16xf32>) outs(%11 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%17 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%18 = linalg.copy ins(%16 : tensor<2x1x32x16xf32>) outs(%17 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%extracted_slice_2 = tensor.extract_slice %18[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%19 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<2x1x17x?xf32>) outs(%19 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %20 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After ConcretizeMmaShapesPass (iree-gpu-concretize-mma-shapes) //----- //
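// No structural change from the previous dump: the multi_mma operand tiles are already in the
// 16x16 intrinsic shape, so this pass appears to have nothing left to concretize here.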
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%10 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_1 = tensor.pad %extracted_slice low[0, 0] high[7, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%11 = tensor.empty() : tensor<2x1x32x16xf32>
%12 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%13 = linalg.fill ins(%cst_0 : f32) outs(%12 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%14 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %13) -> (tensor<2x1x2x1x16x16xf32>) {
%21 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, 0, %21] [2, 1, 32, 16] [1, 1, 1, 1] : tensor<2x1x32x11536xf16> to tensor<2x1x32x16xf16>
%22 = tensor.empty() : tensor<2x1x32x16xf16>
%23 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<2x1x32x16xf16>) outs(%22 : tensor<2x1x32x16xf16>) -> tensor<2x1x32x16xf16>
%24 = tensor.empty() : tensor<2x1x2x1x16x16xf16>
%expanded = tensor.expand_shape %23 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%transposed_4 = linalg.transpose ins(%expanded : tensor<2x1x2x16x1x16xf16>) outs(%24 : tensor<2x1x2x1x16x16xf16>) permutation = [0, 1, 2, 4, 3, 5]
%extracted_slice_5 = tensor.extract_slice %padded_1[%21, 0] [16, 16] [1, 1] : tensor<11536x16xf16> to tensor<16x16xf16>
%25 = tensor.empty() : tensor<16x16xf16>
%26 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_5 : tensor<16x16xf16>) outs(%25 : tensor<16x16xf16>) -> tensor<16x16xf16>
%27 = tensor.empty() : tensor<1x1x16x16xf16>
%expanded_6 = tensor.expand_shape %26 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%transposed_7 = linalg.transpose ins(%expanded_6 : tensor<1x16x1x16xf16>) outs(%27 : tensor<1x1x16x16xf16>) permutation = [0, 2, 3, 1]
%28 = iree_gpu.multi_mma %transposed_4, %transposed_7, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
scf.yield %28 : tensor<2x1x2x1x16x16xf32>
}
%15 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%14 : tensor<2x1x2x1x16x16xf32>) outs(%15 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%16 = linalg.copy ins(%collapsed : tensor<2x1x32x16xf32>) outs(%11 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%17 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%18 = linalg.copy ins(%16 : tensor<2x1x32x16xf32>) outs(%17 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%extracted_slice_2 = tensor.extract_slice %18[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%19 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<2x1x17x?xf32>) outs(%19 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %20 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After PropagateReshapesByExpansionPass (iree-codegen-propagate-reshapes-by-expansion) //----- //
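// Reshape propagation: the collapse_shape that followed the accumulator transpose is pushed
// toward the write-back. The copies into and out of the workgroup-memory buffer now run as
// identity linalg.generic ops on the expanded 2x1x2x16x1x16 shape (the workgroup alloc_tensor
// is expand_shape'd to match), and the collapse to 2x1x32x16 happens only right before the
// final extract_slice.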
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%10 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_1 = tensor.pad %extracted_slice low[0, 0] high[7, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%11 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%12 = linalg.fill ins(%cst_0 : f32) outs(%11 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%13 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %12) -> (tensor<2x1x2x1x16x16xf32>) {
%21 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, 0, %21] [2, 1, 32, 16] [1, 1, 1, 1] : tensor<2x1x32x11536xf16> to tensor<2x1x32x16xf16>
%22 = tensor.empty() : tensor<2x1x32x16xf16>
%23 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<2x1x32x16xf16>) outs(%22 : tensor<2x1x32x16xf16>) -> tensor<2x1x32x16xf16>
%24 = tensor.empty() : tensor<2x1x2x1x16x16xf16>
%expanded_4 = tensor.expand_shape %23 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%transposed_5 = linalg.transpose ins(%expanded_4 : tensor<2x1x2x16x1x16xf16>) outs(%24 : tensor<2x1x2x1x16x16xf16>) permutation = [0, 1, 2, 4, 3, 5]
%extracted_slice_6 = tensor.extract_slice %padded_1[%21, 0] [16, 16] [1, 1] : tensor<11536x16xf16> to tensor<16x16xf16>
%25 = tensor.empty() : tensor<16x16xf16>
%26 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_6 : tensor<16x16xf16>) outs(%25 : tensor<16x16xf16>) -> tensor<16x16xf16>
%27 = tensor.empty() : tensor<1x1x16x16xf16>
%expanded_7 = tensor.expand_shape %26 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%transposed_8 = linalg.transpose ins(%expanded_7 : tensor<1x16x1x16xf16>) outs(%27 : tensor<1x1x16x16xf16>) permutation = [0, 2, 3, 1]
%28 = iree_gpu.multi_mma %transposed_5, %transposed_8, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
scf.yield %28 : tensor<2x1x2x1x16x16xf32>
}
%14 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%13 : tensor<2x1x2x1x16x16xf32>) outs(%14 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%15 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%transposed : tensor<2x1x2x16x1x16xf32>) outs(%15 : tensor<2x1x2x16x1x16xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<2x1x2x16x1x16xf32>
%17 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%expanded = tensor.expand_shape %17 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf32> into tensor<2x1x2x16x1x16xf32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%16 : tensor<2x1x2x16x1x16xf32>) outs(%expanded : tensor<2x1x2x16x1x16xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<2x1x2x16x1x16xf32>
%collapsed = tensor.collapse_shape %18 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%extracted_slice_2 = tensor.extract_slice %collapsed[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%19 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<2x1x17x?xf32>) outs(%19 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %20 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
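// Canonicalization removes the identity linalg.generic copies and the expand_shape of the
// workgroup buffer introduced above: the collapse_shape now applies directly to the transposed
// accumulator, and the workgroup-memory alloc_tensor is left unused in this dump.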
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%10 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_1 = tensor.pad %extracted_slice low[0, 0] high[7, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%11 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%12 = linalg.fill ins(%cst_0 : f32) outs(%11 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%13 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %12) -> (tensor<2x1x2x1x16x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, 0, %18] [2, 1, 32, 16] [1, 1, 1, 1] : tensor<2x1x32x11536xf16> to tensor<2x1x32x16xf16>
%19 = tensor.empty() : tensor<2x1x32x16xf16>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<2x1x32x16xf16>) outs(%19 : tensor<2x1x32x16xf16>) -> tensor<2x1x32x16xf16>
%21 = tensor.empty() : tensor<2x1x2x1x16x16xf16>
%expanded = tensor.expand_shape %20 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%transposed_4 = linalg.transpose ins(%expanded : tensor<2x1x2x16x1x16xf16>) outs(%21 : tensor<2x1x2x1x16x16xf16>) permutation = [0, 1, 2, 4, 3, 5]
%extracted_slice_5 = tensor.extract_slice %padded_1[%18, 0] [16, 16] [1, 1] : tensor<11536x16xf16> to tensor<16x16xf16>
%22 = tensor.empty() : tensor<16x16xf16>
%23 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_5 : tensor<16x16xf16>) outs(%22 : tensor<16x16xf16>) -> tensor<16x16xf16>
%24 = tensor.empty() : tensor<1x1x16x16xf16>
%expanded_6 = tensor.expand_shape %23 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%transposed_7 = linalg.transpose ins(%expanded_6 : tensor<1x16x1x16xf16>) outs(%24 : tensor<1x1x16x16xf16>) permutation = [0, 2, 3, 1]
%25 = iree_gpu.multi_mma %transposed_4, %transposed_7, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
scf.yield %25 : tensor<2x1x2x1x16x16xf32>
}
%14 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%13 : tensor<2x1x2x1x16x16xf32>) outs(%14 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%15 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%extracted_slice_2 = tensor.extract_slice %collapsed[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%16 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%17 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<2x1x17x?xf32>) outs(%16 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
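// CSE finds nothing further to eliminate; this dump is identical to the previous one.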
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%10 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_1 = tensor.pad %extracted_slice low[0, 0] high[7, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%11 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%12 = linalg.fill ins(%cst_0 : f32) outs(%11 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%13 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %12) -> (tensor<2x1x2x1x16x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, 0, %18] [2, 1, 32, 16] [1, 1, 1, 1] : tensor<2x1x32x11536xf16> to tensor<2x1x32x16xf16>
%19 = tensor.empty() : tensor<2x1x32x16xf16>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<2x1x32x16xf16>) outs(%19 : tensor<2x1x32x16xf16>) -> tensor<2x1x32x16xf16>
%21 = tensor.empty() : tensor<2x1x2x1x16x16xf16>
%expanded = tensor.expand_shape %20 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%transposed_4 = linalg.transpose ins(%expanded : tensor<2x1x2x16x1x16xf16>) outs(%21 : tensor<2x1x2x1x16x16xf16>) permutation = [0, 1, 2, 4, 3, 5]
%extracted_slice_5 = tensor.extract_slice %padded_1[%18, 0] [16, 16] [1, 1] : tensor<11536x16xf16> to tensor<16x16xf16>
%22 = tensor.empty() : tensor<16x16xf16>
%23 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_5 : tensor<16x16xf16>) outs(%22 : tensor<16x16xf16>) -> tensor<16x16xf16>
%24 = tensor.empty() : tensor<1x1x16x16xf16>
%expanded_6 = tensor.expand_shape %23 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%transposed_7 = linalg.transpose ins(%expanded_6 : tensor<1x16x1x16xf16>) outs(%24 : tensor<1x1x16x16xf16>) permutation = [0, 2, 3, 1]
%25 = iree_gpu.multi_mma %transposed_4, %transposed_7, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
scf.yield %25 : tensor<2x1x2x1x16x16xf32>
}
%14 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%13 : tensor<2x1x2x1x16x16xf32>) outs(%14 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%15 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%extracted_slice_2 = tensor.extract_slice %collapsed[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%16 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%17 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<2x1x17x?xf32>) outs(%16 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
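// Note on the constants that recur in the dumps below: a minimal Python sketch
// (variable names are illustrative, not from the IR) of how the im2col/IGEMM shapes
// and the %c721 reduction trip count follow from the conv_nhwc_unaligned_stride_2 shapes.
import math

N, H, W, C = 2, 35, 35, 1281        # input tensor<2x35x35x1281xf16>
kh, kw, stride = 3, 3, 2            # kernel_size = [3, 3], strides = [2, 2]
k_tile, n_tile = 16, 16             # reduction / workgroup-N tile sizes in the lowering_config

K = kh * kw * C                     # im2col reduction size        -> 11529
H_out = (H - kh) // stride + 1      # output spatial size          -> 17
trips = math.ceil(K / k_tile)       # scf.for trip count (%c721)   -> 721
assert (K, H_out, trips) == (11529, 17, 721)
assert 1281 % n_tile != 0           # N = 1281 is unaligned, hence the affine.min<(d0) -> (-d0 + 1281, 16)> guards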
// -----// IR Dump After GPUApplyTilingLevelPass (iree-codegen-gpu-apply-tiling-level) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%9 = linalg.fill ins(%cst_0 : f32) outs(%8 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%10 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %9) -> (tensor<2x1x2x1x16x16xf32>) {
%15 = tensor.empty() : tensor<2x1x32x16xf16>
%16 = scf.forall (%arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0) to (2, 1, 32, 16) step (1, 1, 1, 4) shared_outs(%arg9 = %15) -> (tensor<2x1x32x16xf16>) {
%22 = affine.min affine_map<(d0) -> (d0, 2)>(%arg5)
%23 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = affine.min affine_map<(d0) -> (d0, 1)>(%arg6)
%26 = affine.min affine_map<(d0) -> (-d0 + 1, 1)>(%25)
%27 = arith.cmpi eq, %26, %c0 : index
%28 = arith.ori %27, %24 : i1
%29 = affine.min affine_map<(d0) -> (d0, 17)>(%arg7)
%30 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%29)
%31 = arith.cmpi eq, %30, %c0 : index
%32 = arith.ori %31, %28 : i1
%33 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%30)
%34 = affine.min affine_map<(d0)[s0] -> (d0 * 16 + s0, 11529)>(%arg3)[%arg8]
%35 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %36, %32 : i1
%38 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%35)
%39 = scf.if %37 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_4 = tensor.extract_slice %3[%22, 0, 0, 0] [%23, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%41 = tensor.empty(%23, %26, %30, %35) : tensor<?x?x?x?xf16>
%42 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%25, %arg0)
%43 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%42, %29] * [17, 1] k_offset = [%34] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_4 : tensor<?x35x35x1281xf16>) outs(%41 : tensor<?x?x?x?xf16>) -> tensor<?x?x?x?xf16>
%padded = tensor.pad %43 low[0, 0, 0, 0] high[0, 0, %33, %38] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<?x?x?x?xf16> to tensor<?x?x?x?xf16>
%cast = tensor.cast %padded : tensor<?x?x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %cast : tensor<1x1x1x4xf16>
}
%extracted_slice = tensor.extract_slice %arg9[%arg5, %arg6, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%40 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%39 : tensor<1x1x1x4xf16>) outs(%extracted_slice : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %40 into %arg9[%arg5, %arg6, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%17 = tensor.empty() : tensor<2x1x2x1x16x16xf16>
%expanded = tensor.expand_shape %16 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%transposed_1 = linalg.transpose ins(%expanded : tensor<2x1x2x16x1x16xf16>) outs(%17 : tensor<2x1x2x1x16x16xf16>) permutation = [0, 1, 2, 4, 3, 5]
%18 = tensor.empty() : tensor<16x16xf16>
%19 = scf.forall (%arg5, %arg6) in (16, 16) shared_outs(%arg7 = %18) -> (tensor<16x16xf16>) {
%22 = affine.min affine_map<(d0)[s0] -> (d0 * 16 + s0, 11529)>(%arg3)[%arg5]
%23 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%23)
%26 = affine.min affine_map<(d0, d1) -> (d0, d1)>(%arg6, %7)
%27 = affine.min affine_map<(d0, d1) -> (d0 - d1, 1)>(%7, %26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %24 : i1
%30 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%27)
%31 = scf.if %29 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%33 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%26, %arg1]
%extracted_slice_4 = tensor.extract_slice %5[%22, %33] [%23, %27] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_4 low[0, 0] high[%25, %30] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<?x?xf16>
%cast = tensor.cast %padded : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %cast : tensor<1x1xf16>
}
%extracted_slice = tensor.extract_slice %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%32 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%31 : tensor<1x1xf16>) outs(%extracted_slice : tensor<1x1xf16>) -> tensor<1x1xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %32 into %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%20 = tensor.empty() : tensor<1x1x16x16xf16>
%expanded_2 = tensor.expand_shape %19 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%transposed_3 = linalg.transpose ins(%expanded_2 : tensor<1x16x1x16xf16>) outs(%20 : tensor<1x1x16x16xf16>) permutation = [0, 2, 3, 1]
%21 = iree_gpu.multi_mma %transposed_1, %transposed_3, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
scf.yield %21 : tensor<2x1x2x1x16x16xf32>
}
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%10 : tensor<2x1x2x1x16x16xf32>) outs(%11 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%13 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%14 = scf.forall (%arg3, %arg4, %arg5, %arg6) = (0, 0, 0, 0) to (2, 1, 17, %7) step (1, 1, 1, 4) shared_outs(%arg7 = %13) -> (tensor<2x1x17x?xf32>) {
%15 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 4)>(%arg6)[%7]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%16 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg7[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %14 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%9 = linalg.fill ins(%cst_0 : f32) outs(%8 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%10 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %9) -> (tensor<2x1x2x1x16x16xf32>) {
%15 = tensor.empty() : tensor<2x1x32x16xf16>
%16 = scf.forall (%arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0) to (2, 1, 32, 16) step (1, 1, 1, 4) shared_outs(%arg9 = %15) -> (tensor<2x1x32x16xf16>) {
%22 = affine.min affine_map<(d0) -> (2, d0)>(%arg5)
%23 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = affine.min affine_map<(d0) -> (17, d0)>(%arg7)
%26 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%25)
%27 = arith.cmpi eq, %26, %c0 : index
%28 = arith.ori %27, %24 : i1
%29 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%26)
%30 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg8]
%31 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%30)
%32 = arith.cmpi eq, %31, %c0 : index
%33 = arith.ori %32, %28 : i1
%34 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%31)
%35 = scf.if %33 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_4 = tensor.extract_slice %3[%22, 0, 0, 0] [%23, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%37 = tensor.empty(%23, %26, %31) : tensor<?x1x?x?xf16>
%38 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %25] * [17, 1] k_offset = [%30] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_4 : tensor<?x35x35x1281xf16>) outs(%37 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %38 low[0, 0, 0, 0] high[0, 0, %29, %34] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%36 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%35 : tensor<1x1x1x4xf16>) outs(%extracted_slice : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %36 into %arg9[%arg5, 0, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%17 = tensor.empty() : tensor<2x1x2x1x16x16xf16>
%expanded = tensor.expand_shape %16 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%transposed_1 = linalg.transpose ins(%expanded : tensor<2x1x2x16x1x16xf16>) outs(%17 : tensor<2x1x2x1x16x16xf16>) permutation = [0, 1, 2, 4, 3, 5]
%18 = tensor.empty() : tensor<16x16xf16>
%19 = scf.forall (%arg5, %arg6) in (16, 16) shared_outs(%arg7 = %18) -> (tensor<16x16xf16>) {
%22 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg5]
%23 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%23)
%26 = affine.min affine_map<(d0, d1) -> (-d1 + 1281, 16, d0)>(%arg6, %arg1)
%27 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%7, %26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %24 : i1
%30 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%27)
%31 = scf.if %29 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%33 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%26, %arg1]
%extracted_slice_4 = tensor.extract_slice %5[%22, %33] [%23, %27] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_4 low[0, 0] high[%25, %30] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%extracted_slice = tensor.extract_slice %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%32 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%31 : tensor<1x1xf16>) outs(%extracted_slice : tensor<1x1xf16>) -> tensor<1x1xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %32 into %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%20 = tensor.empty() : tensor<1x1x16x16xf16>
%expanded_2 = tensor.expand_shape %19 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%transposed_3 = linalg.transpose ins(%expanded_2 : tensor<1x16x1x16xf16>) outs(%20 : tensor<1x1x16x16xf16>) permutation = [0, 2, 3, 1]
%21 = iree_gpu.multi_mma %transposed_1, %transposed_3, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
scf.yield %21 : tensor<2x1x2x1x16x16xf32>
}
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%10 : tensor<2x1x2x1x16x16xf32>) outs(%11 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%13 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%14 = scf.forall (%arg3, %arg4, %arg5, %arg6) = (0, 0, 0, 0) to (2, 1, 17, %7) step (1, 1, 1, 4) shared_outs(%arg7 = %13) -> (tensor<2x1x17x?xf32>) {
%15 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 4)>(%arg6)[%7]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%16 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg7[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %14 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%9 = linalg.fill ins(%cst_0 : f32) outs(%8 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%10 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %9) -> (tensor<2x1x2x1x16x16xf32>) {
%15 = tensor.empty() : tensor<2x1x32x16xf16>
%16 = scf.forall (%arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0) to (2, 1, 32, 16) step (1, 1, 1, 4) shared_outs(%arg9 = %15) -> (tensor<2x1x32x16xf16>) {
%22 = affine.min affine_map<(d0) -> (2, d0)>(%arg5)
%23 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = affine.min affine_map<(d0) -> (17, d0)>(%arg7)
%26 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%25)
%27 = arith.cmpi eq, %26, %c0 : index
%28 = arith.ori %27, %24 : i1
%29 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%26)
%30 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg8]
%31 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%30)
%32 = arith.cmpi eq, %31, %c0 : index
%33 = arith.ori %32, %28 : i1
%34 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%31)
%35 = scf.if %33 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_4 = tensor.extract_slice %3[%22, 0, 0, 0] [%23, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%37 = tensor.empty(%23, %26, %31) : tensor<?x1x?x?xf16>
%38 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %25] * [17, 1] k_offset = [%30] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_4 : tensor<?x35x35x1281xf16>) outs(%37 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %38 low[0, 0, 0, 0] high[0, 0, %29, %34] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%36 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%35 : tensor<1x1x1x4xf16>) outs(%extracted_slice : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %36 into %arg9[%arg5, 0, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%17 = tensor.empty() : tensor<2x1x2x1x16x16xf16>
%expanded = tensor.expand_shape %16 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%transposed_1 = linalg.transpose ins(%expanded : tensor<2x1x2x16x1x16xf16>) outs(%17 : tensor<2x1x2x1x16x16xf16>) permutation = [0, 1, 2, 4, 3, 5]
%18 = tensor.empty() : tensor<16x16xf16>
%19 = scf.forall (%arg5, %arg6) in (16, 16) shared_outs(%arg7 = %18) -> (tensor<16x16xf16>) {
%22 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg5]
%23 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%23)
%26 = affine.min affine_map<(d0, d1) -> (-d1 + 1281, 16, d0)>(%arg6, %arg1)
%27 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%7, %26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %24 : i1
%30 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%27)
%31 = scf.if %29 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%33 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%26, %arg1]
%extracted_slice_4 = tensor.extract_slice %5[%22, %33] [%23, %27] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_4 low[0, 0] high[%25, %30] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%extracted_slice = tensor.extract_slice %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%32 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%31 : tensor<1x1xf16>) outs(%extracted_slice : tensor<1x1xf16>) -> tensor<1x1xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %32 into %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%20 = tensor.empty() : tensor<1x1x16x16xf16>
%expanded_2 = tensor.expand_shape %19 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%transposed_3 = linalg.transpose ins(%expanded_2 : tensor<1x16x1x16xf16>) outs(%20 : tensor<1x1x16x16xf16>) permutation = [0, 2, 3, 1]
%21 = iree_gpu.multi_mma %transposed_1, %transposed_3, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
scf.yield %21 : tensor<2x1x2x1x16x16xf32>
}
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%10 : tensor<2x1x2x1x16x16xf32>) outs(%11 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%13 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%14 = scf.forall (%arg3, %arg4, %arg5, %arg6) = (0, 0, 0, 0) to (2, 1, 17, %7) step (1, 1, 1, 4) shared_outs(%arg7 = %13) -> (tensor<2x1x17x?xf32>) {
%15 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 4)>(%arg6)[%7]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%16 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg7[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %14 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
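// Note on the promotion copies in the dump above: both operand copies are distributed
// so that one scf.forall iteration maps to one of the 256 workgroup threads
// (workgroup_size = [256, 1, 1]). A minimal sketch of that iteration count:
lhs_iters = 2 * 1 * 32 * (16 // 4)  # forall to (2, 1, 32, 16) step (1, 1, 1, 4): one 1x1x1x4 f16 vector per thread
rhs_iters = 16 * 16                 # forall in (16, 16): one f16 element per thread
assert lhs_iters == rhs_iters == 256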
// -----// IR Dump After GPUApplyTilingLevelPass (iree-codegen-gpu-apply-tiling-level) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%9 = linalg.fill ins(%cst_0 : f32) outs(%8 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%10 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %9) -> (tensor<2x1x2x1x16x16xf32>) {
%15 = tensor.empty() : tensor<2x1x32x16xf16>
%16 = scf.forall (%arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0) to (2, 1, 32, 16) step (1, 1, 1, 4) shared_outs(%arg9 = %15) -> (tensor<2x1x32x16xf16>) {
%20 = affine.min affine_map<(d0) -> (2, d0)>(%arg5)
%21 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = affine.min affine_map<(d0) -> (17, d0)>(%arg7)
%24 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%23)
%25 = arith.cmpi eq, %24, %c0 : index
%26 = arith.ori %25, %22 : i1
%27 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%24)
%28 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg8]
%29 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%28)
%30 = arith.cmpi eq, %29, %c0 : index
%31 = arith.ori %30, %26 : i1
%32 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%29)
%33 = scf.if %31 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_2 = tensor.extract_slice %3[%20, 0, 0, 0] [%21, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%35 = tensor.empty(%21, %24, %29) : tensor<?x1x?x?xf16>
%36 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %23] * [17, 1] k_offset = [%28] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_2 : tensor<?x35x35x1281xf16>) outs(%35 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %36 low[0, 0, 0, 0] high[0, 0, %27, %32] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%34 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%33 : tensor<1x1x1x4xf16>) outs(%extracted_slice : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %34 into %arg9[%arg5, 0, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded = tensor.expand_shape %16 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%17 = tensor.empty() : tensor<16x16xf16>
%18 = scf.forall (%arg5, %arg6) in (16, 16) shared_outs(%arg7 = %17) -> (tensor<16x16xf16>) {
%20 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg5]
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%21)
%24 = affine.min affine_map<(d0, d1) -> (-d1 + 1281, 16, d0)>(%arg6, %arg1)
%25 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%7, %24)
%26 = arith.cmpi eq, %25, %c0 : index
%27 = arith.ori %26, %22 : i1
%28 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%25)
%29 = scf.if %27 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%31 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%24, %arg1]
%extracted_slice_2 = tensor.extract_slice %5[%20, %31] [%21, %25] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_2 low[0, 0] high[%23, %28] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%extracted_slice = tensor.extract_slice %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%30 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%29 : tensor<1x1xf16>) outs(%extracted_slice : tensor<1x1xf16>) -> tensor<1x1xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %30 into %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded_1 = tensor.expand_shape %18 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%19 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 2, 1) shared_outs(%arg9 = %arg4) -> (tensor<2x1x2x1x16x16xf32>) {
%extracted_slice = tensor.extract_slice %expanded[%arg5, %arg6, %arg7, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x16x1x16xf16>
%20 = tensor.empty() : tensor<1x1x1x1x16x16xf16>
%transposed_2 = linalg.transpose ins(%extracted_slice : tensor<1x1x1x16x1x16xf16>) outs(%20 : tensor<1x1x1x1x16x16xf16>) permutation = [0, 1, 2, 4, 3, 5]
%extracted_slice_3 = tensor.extract_slice %expanded_1[0, 0, %arg8, 0] [1, 16, 1, 16] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x16x1x16xf16>
%21 = tensor.empty() : tensor<1x1x16x16xf16>
%transposed_4 = linalg.transpose ins(%extracted_slice_3 : tensor<1x16x1x16xf16>) outs(%21 : tensor<1x1x16x16xf16>) permutation = [0, 2, 3, 1]
%extracted_slice_5 = tensor.extract_slice %arg9[%arg5, %arg6, %arg7, %arg8, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x1x16x16xf32> to tensor<1x1x1x1x16x16xf32>
%22 = iree_gpu.multi_mma %transposed_2, %transposed_4, %extracted_slice_5 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<1x1x1x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<1x1x1x1x16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg9[%arg5, %arg6, %arg7, %arg8, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> into tensor<2x1x2x1x16x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
scf.yield %19 : tensor<2x1x2x1x16x16xf32>
}
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%10 : tensor<2x1x2x1x16x16xf32>) outs(%11 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%13 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%14 = scf.forall (%arg3, %arg4, %arg5, %arg6) = (0, 0, 0, 0) to (2, 1, 17, %7) step (1, 1, 1, 4) shared_outs(%arg7 = %13) -> (tensor<2x1x17x?xf32>) {
%15 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 4)>(%arg6)[%7]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%16 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg7[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %14 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After DistributeMmaToLanesPass (iree-gpu-distribute-mma-to-lanes) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%9 = linalg.fill ins(%cst_0 : f32) outs(%8 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%10 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %9) -> (tensor<2x1x2x1x16x16xf32>) {
%15 = tensor.empty() : tensor<2x1x32x16xf16>
%16 = scf.forall (%arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0) to (2, 1, 32, 16) step (1, 1, 1, 4) shared_outs(%arg9 = %15) -> (tensor<2x1x32x16xf16>) {
%20 = affine.min affine_map<(d0) -> (2, d0)>(%arg5)
%21 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = affine.min affine_map<(d0) -> (17, d0)>(%arg7)
%24 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%23)
%25 = arith.cmpi eq, %24, %c0 : index
%26 = arith.ori %25, %22 : i1
%27 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%24)
%28 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg8]
%29 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%28)
%30 = arith.cmpi eq, %29, %c0 : index
%31 = arith.ori %30, %26 : i1
%32 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%29)
%33 = scf.if %31 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_2 = tensor.extract_slice %3[%20, 0, 0, 0] [%21, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%35 = tensor.empty(%21, %24, %29) : tensor<?x1x?x?xf16>
%36 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %23] * [17, 1] k_offset = [%28] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_2 : tensor<?x35x35x1281xf16>) outs(%35 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %36 low[0, 0, 0, 0] high[0, 0, %27, %32] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%34 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%33 : tensor<1x1x1x4xf16>) outs(%extracted_slice : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %34 into %arg9[%arg5, 0, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded = tensor.expand_shape %16 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%17 = tensor.empty() : tensor<16x16xf16>
%18 = scf.forall (%arg5, %arg6) in (16, 16) shared_outs(%arg7 = %17) -> (tensor<16x16xf16>) {
%20 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg5]
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%21)
%24 = affine.min affine_map<(d0, d1) -> (-d1 + 1281, 16, d0)>(%arg6, %arg1)
%25 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%7, %24)
%26 = arith.cmpi eq, %25, %c0 : index
%27 = arith.ori %26, %22 : i1
%28 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%25)
%29 = scf.if %27 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%31 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%24, %arg1]
%extracted_slice_2 = tensor.extract_slice %5[%20, %31] [%21, %25] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_2 low[0, 0] high[%23, %28] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%extracted_slice = tensor.extract_slice %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%30 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%29 : tensor<1x1xf16>) outs(%extracted_slice : tensor<1x1xf16>) -> tensor<1x1xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %30 into %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded_1 = tensor.expand_shape %18 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%19 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 2, 1) shared_outs(%arg9 = %arg4) -> (tensor<2x1x2x1x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg9[%arg5, %arg6, %arg7, %arg8, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x1x16x16xf32> to tensor<1x1x1x1x16x16xf32>
%20 = scf.forall (%arg10) in (64) shared_outs(%arg11 = %extracted_slice) -> (tensor<1x1x1x1x16x16xf32>) {
%21 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%22 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%extracted_slice_2 = tensor.extract_slice %expanded[%arg5, %arg6, %arg7, %21, 0, %22] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%23 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%23 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%24 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%25 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%extracted_slice_4 = tensor.extract_slice %expanded_1[0, %24, %arg8, %25] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%26 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_5 = linalg.transpose ins(%extracted_slice_4 : tensor<1x4x1x1xf16>) outs(%26 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%27 = affine.apply affine_map<(d0) -> (((d0 floordiv 16) mod 4) * 4)>(%arg10)
%28 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%extracted_slice_6 = tensor.extract_slice %arg11[0, 0, 0, 0, %27, %28] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> to tensor<1x1x1x1x4x1xf32>
%29 = iree_gpu.multi_mma %transposed_3, %transposed_5, %extracted_slice_6 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %29 into %arg11[0, 0, 0, 0, %27, %28] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x1xf32> into tensor<1x1x1x1x16x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %20 into %arg9[%arg5, %arg6, %arg7, %arg8, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> into tensor<2x1x2x1x16x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
scf.yield %19 : tensor<2x1x2x1x16x16xf32>
}
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%10 : tensor<2x1x2x1x16x16xf32>) outs(%11 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%13 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%14 = scf.forall (%arg3, %arg4, %arg5, %arg6) = (0, 0, 0, 0) to (2, 1, 17, %7) step (1, 1, 1, 4) shared_outs(%arg7 = %13) -> (tensor<2x1x17x?xf32>) {
%15 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 4)>(%arg6)[%7]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%16 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg7[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %14 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
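// Note on the lane distribution introduced by DistributeMmaToLanesPass above: the affine
// maps inside the #iree_gpu.lane_id<0> forall tile the 16x16x16 MFMA operands across the
// 64 lanes of a subgroup. A minimal Python sketch (function names are illustrative):
def a_slice(lane):
    # maps %21/%22: (d0 mod 16, ((d0 floordiv 16) mod 4) * 4) -> each lane loads a 1x4 f16 strip of A
    return lane % 16, ((lane // 16) % 4) * 4

def acc_slice(lane):
    # maps %27/%28: (((d0 floordiv 16) mod 4) * 4, d0 mod 16) -> each lane owns a 4x1 f32 strip of the accumulator
    return ((lane // 16) % 4) * 4, lane % 16

# 64 lanes x 4 elements tile the 16x16 accumulator exactly (256 lane-elements = 16 * 16).
cover = {(r + i, c) for lane in range(64) for r, c in [acc_slice(lane)] for i in range(4)}
assert cover == {(r, c) for r in range(16) for c in range(16)}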
// -----// IR Dump After GPULowerToUKernelsPass (iree-codegen-gpu-lower-to-ukernels) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%9 = linalg.fill ins(%cst_0 : f32) outs(%8 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%10 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %9) -> (tensor<2x1x2x1x16x16xf32>) {
%15 = tensor.empty() : tensor<2x1x32x16xf16>
%16 = scf.forall (%arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0) to (2, 1, 32, 16) step (1, 1, 1, 4) shared_outs(%arg9 = %15) -> (tensor<2x1x32x16xf16>) {
%20 = affine.min affine_map<(d0) -> (2, d0)>(%arg5)
%21 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = affine.min affine_map<(d0) -> (17, d0)>(%arg7)
%24 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%23)
%25 = arith.cmpi eq, %24, %c0 : index
%26 = arith.ori %25, %22 : i1
%27 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%24)
%28 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg8]
%29 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%28)
%30 = arith.cmpi eq, %29, %c0 : index
%31 = arith.ori %30, %26 : i1
%32 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%29)
%33 = scf.if %31 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_2 = tensor.extract_slice %3[%20, 0, 0, 0] [%21, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%35 = tensor.empty(%21, %24, %29) : tensor<?x1x?x?xf16>
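// The im2col op below gathers the 3x3, stride-2 convolution window for the current (m, k) position straight from the 2x35x35x1281 input; with the slice bounds above it produces at most a 1x1x1x4 sliver per thread, so no full im2col buffer is ever materialized.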
%36 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %23] * [17, 1] k_offset = [%28] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_2 : tensor<?x35x35x1281xf16>) outs(%35 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %36 low[0, 0, 0, 0] high[0, 0, %27, %32] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%34 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%33 : tensor<1x1x1x4xf16>) outs(%extracted_slice : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %34 into %arg9[%arg5, 0, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded = tensor.expand_shape %16 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%17 = tensor.empty() : tensor<16x16xf16>
%18 = scf.forall (%arg5, %arg6) in (16, 16) shared_outs(%arg7 = %17) -> (tensor<16x16xf16>) {
%20 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg5]
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%21)
%24 = affine.min affine_map<(d0, d1) -> (-d1 + 1281, 16, d0)>(%arg6, %arg1)
%25 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%7, %24)
%26 = arith.cmpi eq, %25, %c0 : index
%27 = arith.ori %26, %22 : i1
%28 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%25)
%29 = scf.if %27 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%31 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%24, %arg1]
%extracted_slice_2 = tensor.extract_slice %5[%20, %31] [%21, %25] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_2 low[0, 0] high[%23, %28] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%extracted_slice = tensor.extract_slice %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%30 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%29 : tensor<1x1xf16>) outs(%extracted_slice : tensor<1x1xf16>) -> tensor<1x1xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %30 into %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded_1 = tensor.expand_shape %18 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%19 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 2, 1) shared_outs(%arg9 = %arg4) -> (tensor<2x1x2x1x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg9[%arg5, %arg6, %arg7, %arg8, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x1x16x16xf32> to tensor<1x1x1x1x16x16xf32>
%20 = scf.forall (%arg10) in (64) shared_outs(%arg11 = %extracted_slice) -> (tensor<1x1x1x1x16x16xf32>) {
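// Lane distribution over the 64-lane subgroup: the affine maps below take %arg10 mod 16 for the row/column inside the 16x16 tile and use %arg10 floordiv 16 to select one of four groups of 4 contiguous elements, consistent with the MFMA_F32_16x16x16_F16 layout (4 x f16 per lane for A and B, 4 x f32 per lane for the accumulator).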
%21 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%22 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%extracted_slice_2 = tensor.extract_slice %expanded[%arg5, %arg6, %arg7, %21, 0, %22] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%23 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%23 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%24 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%25 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%extracted_slice_4 = tensor.extract_slice %expanded_1[0, %24, %arg8, %25] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%26 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_5 = linalg.transpose ins(%extracted_slice_4 : tensor<1x4x1x1xf16>) outs(%26 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%27 = affine.apply affine_map<(d0) -> (((d0 floordiv 16) mod 4) * 4)>(%arg10)
%28 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%extracted_slice_6 = tensor.extract_slice %arg11[0, 0, 0, 0, %27, %28] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> to tensor<1x1x1x1x4x1xf32>
%29 = iree_gpu.multi_mma %transposed_3, %transposed_5, %extracted_slice_6 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %29 into %arg11[0, 0, 0, 0, %27, %28] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x1xf32> into tensor<1x1x1x1x16x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %20 into %arg9[%arg5, %arg6, %arg7, %arg8, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> into tensor<2x1x2x1x16x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
scf.yield %19 : tensor<2x1x2x1x16x16xf32>
}
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%10 : tensor<2x1x2x1x16x16xf32>) outs(%11 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%13 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%14 = scf.forall (%arg3, %arg4, %arg5, %arg6) = (0, 0, 0, 0) to (2, 1, 17, %7) step (1, 1, 1, 4) shared_outs(%arg7 = %13) -> (tensor<2x1x17x?xf32>) {
%15 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 4)>(%arg6)[%7]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%16 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg7[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %14 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After NormalizeLoopBoundsPass (iree-codegen-normalize-loop-bounds) //----- //
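// Note: relative to the previous dump, the scf.forall bounds are normalized to zero-based, unit-step ranges: the workgroup forall `(0, 0) to (17, 1281) step (1, 16)` becomes `in (17, 81)` with an `affine.apply (d0 * 16)` recovering the original column offset, and the thread forall over `(2, 1, 32, 16) step (1, 1, 1, 4)` becomes `in (2, 1, 32, 4)` with `d0 * 4` applied inside the body.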
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.apply affine_map<(d0) -> (d0)>(%arg0)
%9 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%7)
%10 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%11 = linalg.fill ins(%cst_0 : f32) outs(%10 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%12 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %11) -> (tensor<2x1x2x1x16x16xf32>) {
%18 = tensor.empty() : tensor<2x1x32x16xf16>
%19 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 32, 4) shared_outs(%arg9 = %18) -> (tensor<2x1x32x16xf16>) {
%23 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg8)
%24 = affine.apply affine_map<(d0) -> (d0)>(%arg7)
%25 = affine.apply affine_map<(d0) -> (d0)>(%arg6)
%26 = affine.apply affine_map<(d0) -> (d0)>(%arg5)
%27 = affine.min affine_map<(d0) -> (2, d0)>(%26)
%28 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%27)
%29 = arith.cmpi eq, %28, %c0 : index
%30 = affine.min affine_map<(d0) -> (17, d0)>(%24)
%31 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%30)
%32 = arith.cmpi eq, %31, %c0 : index
%33 = arith.ori %32, %29 : i1
%34 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%31)
%35 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%23]
%36 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%35)
%37 = arith.cmpi eq, %36, %c0 : index
%38 = arith.ori %37, %33 : i1
%39 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%36)
%40 = scf.if %38 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_2 = tensor.extract_slice %3[%27, 0, 0, 0] [%28, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%42 = tensor.empty(%28, %31, %36) : tensor<?x1x?x?xf16>
%43 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%8, %30] * [17, 1] k_offset = [%35] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_2 : tensor<?x35x35x1281xf16>) outs(%42 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %43 low[0, 0, 0, 0] high[0, 0, %34, %39] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice = tensor.extract_slice %arg9[%26, 0, %24, %23] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%41 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%40 : tensor<1x1x1x4xf16>) outs(%extracted_slice : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %41 into %arg9[%26, 0, %24, %23] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded = tensor.expand_shape %19 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%20 = tensor.empty() : tensor<16x16xf16>
%21 = scf.forall (%arg5, %arg6) in (16, 16) shared_outs(%arg7 = %20) -> (tensor<16x16xf16>) {
%23 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg5]
%24 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%23)
%25 = arith.cmpi eq, %24, %c0 : index
%26 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%24)
%27 = affine.min affine_map<(d0, d1) -> (-d1 + 1281, 16, d0)>(%arg6, %7)
%28 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%9, %27)
%29 = arith.cmpi eq, %28, %c0 : index
%30 = arith.ori %29, %25 : i1
%31 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%28)
%32 = scf.if %30 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%34 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%27, %7]
%extracted_slice_2 = tensor.extract_slice %5[%23, %34] [%24, %28] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_2 low[0, 0] high[%26, %31] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%extracted_slice = tensor.extract_slice %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%33 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%32 : tensor<1x1xf16>) outs(%extracted_slice : tensor<1x1xf16>) -> tensor<1x1xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %33 into %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded_1 = tensor.expand_shape %21 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%22 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 2, 1) shared_outs(%arg9 = %arg4) -> (tensor<2x1x2x1x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg9[%arg5, %arg6, %arg7, %arg8, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x1x16x16xf32> to tensor<1x1x1x1x16x16xf32>
%23 = scf.forall (%arg10) in (64) shared_outs(%arg11 = %extracted_slice) -> (tensor<1x1x1x1x16x16xf32>) {
%24 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%25 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%extracted_slice_2 = tensor.extract_slice %expanded[%arg5, %arg6, %arg7, %24, 0, %25] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%26 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%26 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%27 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%28 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%extracted_slice_4 = tensor.extract_slice %expanded_1[0, %27, %arg8, %28] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%29 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_5 = linalg.transpose ins(%extracted_slice_4 : tensor<1x4x1x1xf16>) outs(%29 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%30 = affine.apply affine_map<(d0) -> (((d0 floordiv 16) mod 4) * 4)>(%arg10)
%31 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%extracted_slice_6 = tensor.extract_slice %arg11[0, 0, 0, 0, %30, %31] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> to tensor<1x1x1x1x4x1xf32>
%32 = iree_gpu.multi_mma %transposed_3, %transposed_5, %extracted_slice_6 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %32 into %arg11[0, 0, 0, 0, %30, %31] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x1xf32> into tensor<1x1x1x1x16x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %23 into %arg9[%arg5, %arg6, %arg7, %arg8, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> into tensor<2x1x2x1x16x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
scf.yield %22 : tensor<2x1x2x1x16x16xf32>
}
%13 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%12 : tensor<2x1x2x1x16x16xf32>) outs(%13 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%15 = tensor.empty(%9) : tensor<2x1x17x?xf32>
%16 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%9)
%17 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %16) shared_outs(%arg7 = %15) -> (tensor<2x1x17x?xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%19 = affine.apply affine_map<(d0) -> (d0)>(%arg5)
%20 = affine.apply affine_map<(d0) -> (d0)>(%arg4)
%21 = affine.apply affine_map<(d0) -> (d0)>(%arg3)
%22 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 4)>(%18)[%9]
%extracted_slice = tensor.extract_slice %collapsed[%21, 0, %19, %18] [1, 1, 1, %22] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%21, 0, %19, %18] [1, 1, 1, %22] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%23 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %23 into %arg7[%21, 0, %19, %18] [1, 1, 1, %22] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg2[0, %8, 0, %7] [2, 1, 17, %9] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
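// Note: canonicalization composes the affine.apply / affine.min chains into single maps (e.g. `d0 * -16 + 1281` for the tail extent of the N dimension) and removes the identity affine.apply ops that loop normalization introduced, so the thread ids are used directly again.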
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%10 = linalg.fill ins(%cst_0 : f32) outs(%9 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%11 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %10) -> (tensor<2x1x2x1x16x16xf32>) {
%17 = tensor.empty() : tensor<2x1x32x16xf16>
%18 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 32, 4) shared_outs(%arg9 = %17) -> (tensor<2x1x32x16xf16>) {
%22 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg8)
%23 = affine.min affine_map<(d0) -> (2, d0)>(%arg5)
%24 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%23)
%25 = arith.cmpi eq, %24, %c0 : index
%26 = affine.min affine_map<(d0) -> (17, d0)>(%arg7)
%27 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %25 : i1
%30 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%27)
%31 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg3, %arg8)
%32 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = arith.ori %33, %29 : i1
%35 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%32)
%36 = scf.if %34 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_2 = tensor.extract_slice %3[%23, 0, 0, 0] [%24, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%38 = tensor.empty(%24, %27, %32) : tensor<?x1x?x?xf16>
%39 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %26] * [17, 1] k_offset = [%31] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_2 : tensor<?x35x35x1281xf16>) outs(%38 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %39 low[0, 0, 0, 0] high[0, 0, %30, %35] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, %22] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%37 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%36 : tensor<1x1x1x4xf16>) outs(%extracted_slice : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %37 into %arg9[%arg5, 0, %arg7, %22] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%19 = tensor.empty() : tensor<16x16xf16>
%20 = scf.forall (%arg5, %arg6) in (16, 16) shared_outs(%arg7 = %19) -> (tensor<16x16xf16>) {
%22 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg5]
%23 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%23)
%26 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%arg6, %arg1)
%27 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %24 : i1
%30 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%27)
%31 = scf.if %29 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%33 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%26]
%extracted_slice_2 = tensor.extract_slice %5[%22, %33] [%23, %27] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_2 low[0, 0] high[%25, %30] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%extracted_slice = tensor.extract_slice %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%32 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%31 : tensor<1x1xf16>) outs(%extracted_slice : tensor<1x1xf16>) -> tensor<1x1xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %32 into %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded_1 = tensor.expand_shape %20 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%21 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 2, 1) shared_outs(%arg9 = %arg4) -> (tensor<2x1x2x1x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, 0, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x1x16x16xf32> to tensor<1x1x1x1x16x16xf32>
%22 = scf.forall (%arg10) in (64) shared_outs(%arg11 = %extracted_slice) -> (tensor<1x1x1x1x16x16xf32>) {
%23 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%24 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%extracted_slice_2 = tensor.extract_slice %expanded[%arg5, 0, %arg7, %23, 0, %24] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%25 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%25 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%26 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%27 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%extracted_slice_4 = tensor.extract_slice %expanded_1[0, %26, 0, %27] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%28 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_5 = linalg.transpose ins(%extracted_slice_4 : tensor<1x4x1x1xf16>) outs(%28 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%29 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%30 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%extracted_slice_6 = tensor.extract_slice %arg11[0, 0, 0, 0, %29, %30] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> to tensor<1x1x1x1x4x1xf32>
%31 = iree_gpu.multi_mma %transposed_3, %transposed_5, %extracted_slice_6 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %31 into %arg11[0, 0, 0, 0, %29, %30] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x1xf32> into tensor<1x1x1x1x16x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg9[%arg5, 0, %arg7, 0, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> into tensor<2x1x2x1x16x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
scf.yield %21 : tensor<2x1x2x1x16x16xf32>
}
%12 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%11 : tensor<2x1x2x1x16x16xf32>) outs(%12 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
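// Note: CSE merges the repeated lane-index computations inside the lane forall; the `d0 mod 16` and `(d0 floordiv 16) * 4 - ...` affine.apply results are now computed once (%23, %24) and shared by the A, B, and accumulator extract/insert slices.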
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%10 = linalg.fill ins(%cst_0 : f32) outs(%9 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%11 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %10) -> (tensor<2x1x2x1x16x16xf32>) {
%17 = tensor.empty() : tensor<2x1x32x16xf16>
%18 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 32, 4) shared_outs(%arg9 = %17) -> (tensor<2x1x32x16xf16>) {
%22 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg8)
%23 = affine.min affine_map<(d0) -> (2, d0)>(%arg5)
%24 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%23)
%25 = arith.cmpi eq, %24, %c0 : index
%26 = affine.min affine_map<(d0) -> (17, d0)>(%arg7)
%27 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %25 : i1
%30 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%27)
%31 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg3, %arg8)
%32 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = arith.ori %33, %29 : i1
%35 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%32)
%36 = scf.if %34 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_2 = tensor.extract_slice %3[%23, 0, 0, 0] [%24, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%38 = tensor.empty(%24, %27, %32) : tensor<?x1x?x?xf16>
%39 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %26] * [17, 1] k_offset = [%31] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_2 : tensor<?x35x35x1281xf16>) outs(%38 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %39 low[0, 0, 0, 0] high[0, 0, %30, %35] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, %22] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%37 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%36 : tensor<1x1x1x4xf16>) outs(%extracted_slice : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %37 into %arg9[%arg5, 0, %arg7, %22] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%19 = tensor.empty() : tensor<16x16xf16>
%20 = scf.forall (%arg5, %arg6) in (16, 16) shared_outs(%arg7 = %19) -> (tensor<16x16xf16>) {
%22 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg5]
%23 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%23)
%26 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%arg6, %arg1)
%27 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %24 : i1
%30 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%27)
%31 = scf.if %29 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%33 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%26]
%extracted_slice_2 = tensor.extract_slice %5[%22, %33] [%23, %27] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_2 low[0, 0] high[%25, %30] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%extracted_slice = tensor.extract_slice %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%32 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%31 : tensor<1x1xf16>) outs(%extracted_slice : tensor<1x1xf16>) -> tensor<1x1xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %32 into %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded_1 = tensor.expand_shape %20 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%21 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 2, 1) shared_outs(%arg9 = %arg4) -> (tensor<2x1x2x1x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, 0, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x1x16x16xf32> to tensor<1x1x1x1x16x16xf32>
%22 = scf.forall (%arg10) in (64) shared_outs(%arg11 = %extracted_slice) -> (tensor<1x1x1x1x16x16xf32>) {
%23 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%24 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%extracted_slice_2 = tensor.extract_slice %expanded[%arg5, 0, %arg7, %23, 0, %24] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%25 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%25 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%extracted_slice_4 = tensor.extract_slice %expanded_1[0, %24, 0, %23] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%26 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_5 = linalg.transpose ins(%extracted_slice_4 : tensor<1x4x1x1xf16>) outs(%26 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%extracted_slice_6 = tensor.extract_slice %arg11[0, 0, 0, 0, %24, %23] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> to tensor<1x1x1x1x4x1xf32>
%27 = iree_gpu.multi_mma %transposed_3, %transposed_5, %extracted_slice_6 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %27 into %arg11[0, 0, 0, 0, %24, %23] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x1xf32> into tensor<1x1x1x1x16x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg9[%arg5, 0, %arg7, 0, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> into tensor<2x1x2x1x16x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
scf.yield %21 : tensor<2x1x2x1x16x16xf32>
}
%12 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%11 : tensor<2x1x2x1x16x16xf32>) outs(%12 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After IREELoopInvariantCodeMotionPass (iree-loop-invariant-code-motion) //----- //
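// Note: loop-invariant code motion hoists the tensor.empty ops that seed the staging tensors (tensor<2x1x32x16xf16> and tensor<16x16xf16>) out of the 721-iteration reduction loop; they are now %11 and %12, defined before the scf.for instead of being recreated every K step.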
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%10 = linalg.fill ins(%cst_0 : f32) outs(%9 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%11 = tensor.empty() : tensor<2x1x32x16xf16>
%12 = tensor.empty() : tensor<16x16xf16>
%13 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %10) -> (tensor<2x1x2x1x16x16xf32>) {
%19 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 32, 4) shared_outs(%arg9 = %11) -> (tensor<2x1x32x16xf16>) {
%22 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg8)
%23 = affine.min affine_map<(d0) -> (2, d0)>(%arg5)
%24 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%23)
%25 = arith.cmpi eq, %24, %c0 : index
%26 = affine.min affine_map<(d0) -> (17, d0)>(%arg7)
%27 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %25 : i1
%30 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%27)
%31 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg3, %arg8)
%32 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = arith.ori %33, %29 : i1
%35 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%32)
%36 = scf.if %34 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_2 = tensor.extract_slice %3[%23, 0, 0, 0] [%24, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%38 = tensor.empty(%24, %27, %32) : tensor<?x1x?x?xf16>
%39 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %26] * [17, 1] k_offset = [%31] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_2 : tensor<?x35x35x1281xf16>) outs(%38 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %39 low[0, 0, 0, 0] high[0, 0, %30, %35] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, %22] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%37 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%36 : tensor<1x1x1x4xf16>) outs(%extracted_slice : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %37 into %arg9[%arg5, 0, %arg7, %22] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded = tensor.expand_shape %19 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%20 = scf.forall (%arg5, %arg6) in (16, 16) shared_outs(%arg7 = %12) -> (tensor<16x16xf16>) {
%22 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg5]
%23 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%23)
%26 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%arg6, %arg1)
%27 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %24 : i1
%30 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%27)
%31 = scf.if %29 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%33 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%26]
%extracted_slice_2 = tensor.extract_slice %5[%22, %33] [%23, %27] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_2 low[0, 0] high[%25, %30] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%extracted_slice = tensor.extract_slice %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%32 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%31 : tensor<1x1xf16>) outs(%extracted_slice : tensor<1x1xf16>) -> tensor<1x1xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %32 into %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded_1 = tensor.expand_shape %20 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%21 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 2, 1) shared_outs(%arg9 = %arg4) -> (tensor<2x1x2x1x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, 0, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x1x16x16xf32> to tensor<1x1x1x1x16x16xf32>
%22 = scf.forall (%arg10) in (64) shared_outs(%arg11 = %extracted_slice) -> (tensor<1x1x1x1x16x16xf32>) {
%23 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%24 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%extracted_slice_2 = tensor.extract_slice %expanded[%arg5, 0, %arg7, %23, 0, %24] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%25 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%25 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%extracted_slice_4 = tensor.extract_slice %expanded_1[0, %24, 0, %23] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%26 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_5 = linalg.transpose ins(%extracted_slice_4 : tensor<1x4x1x1xf16>) outs(%26 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%extracted_slice_6 = tensor.extract_slice %arg11[0, 0, 0, 0, %24, %23] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> to tensor<1x1x1x1x4x1xf32>
%27 = iree_gpu.multi_mma %transposed_3, %transposed_5, %extracted_slice_6 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %27 into %arg11[0, 0, 0, 0, %24, %23] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x1xf32> into tensor<1x1x1x1x16x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg9[%arg5, 0, %arg7, 0, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> into tensor<2x1x2x1x16x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
scf.yield %21 : tensor<2x1x2x1x16x16xf32>
}
%14 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%13 : tensor<2x1x2x1x16x16xf32>) outs(%14 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%15 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%16 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%17 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%18 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %17) shared_outs(%arg7 = %16) -> (tensor<2x1x17x?xf32>) {
%19 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%20 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %19] [1, 1, 1, %20] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %19] [1, 1, 1, %20] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %21 into %arg7[%arg3, 0, %arg5, %19] [1, 1, 1, %20] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
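// [annotation, not part of the compiler dump] This pass folds and hoists redundant tensor.insert_slice / tensor.extract_slice chains. Judging from the tail of the previous dump above, the function body appears largely unchanged at this point; the loop nest and the multi_mma inner tile are the same.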
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%10 = linalg.fill ins(%cst_0 : f32) outs(%9 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%11 = tensor.empty() : tensor<2x1x32x16xf16>
%12 = tensor.empty() : tensor<16x16xf16>
%13 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %10) -> (tensor<2x1x2x1x16x16xf32>) {
%19 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 32, 4) shared_outs(%arg9 = %11) -> (tensor<2x1x32x16xf16>) {
%22 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg8)
%23 = affine.min affine_map<(d0) -> (2, d0)>(%arg5)
%24 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%23)
%25 = arith.cmpi eq, %24, %c0 : index
%26 = affine.min affine_map<(d0) -> (17, d0)>(%arg7)
%27 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %25 : i1
%30 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%27)
%31 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg3, %arg8)
%32 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = arith.ori %33, %29 : i1
%35 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%32)
%36 = scf.if %34 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_2 = tensor.extract_slice %3[%23, 0, 0, 0] [%24, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%38 = tensor.empty(%24, %27, %32) : tensor<?x1x?x?xf16>
%39 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %26] * [17, 1] k_offset = [%31] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_2 : tensor<?x35x35x1281xf16>) outs(%38 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %39 low[0, 0, 0, 0] high[0, 0, %30, %35] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, %22] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%37 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%36 : tensor<1x1x1x4xf16>) outs(%extracted_slice : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %37 into %arg9[%arg5, 0, %arg7, %22] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded = tensor.expand_shape %19 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%20 = scf.forall (%arg5, %arg6) in (16, 16) shared_outs(%arg7 = %12) -> (tensor<16x16xf16>) {
%extracted_slice = tensor.extract_slice %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%22 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg5]
%23 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%23)
%26 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%arg6, %arg1)
%27 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %24 : i1
%30 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%27)
%31 = scf.if %29 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%33 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%26]
%extracted_slice_2 = tensor.extract_slice %5[%22, %33] [%23, %27] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_2 low[0, 0] high[%25, %30] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%32 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%31 : tensor<1x1xf16>) outs(%extracted_slice : tensor<1x1xf16>) -> tensor<1x1xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %32 into %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded_1 = tensor.expand_shape %20 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%21 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 2, 1) shared_outs(%arg9 = %arg4) -> (tensor<2x1x2x1x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, 0, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x1x16x16xf32> to tensor<1x1x1x1x16x16xf32>
%22 = scf.forall (%arg10) in (64) shared_outs(%arg11 = %extracted_slice) -> (tensor<1x1x1x1x16x16xf32>) {
%23 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%24 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%extracted_slice_2 = tensor.extract_slice %expanded[%arg5, 0, %arg7, %23, 0, %24] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%25 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%25 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%extracted_slice_4 = tensor.extract_slice %expanded_1[0, %24, 0, %23] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%26 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_5 = linalg.transpose ins(%extracted_slice_4 : tensor<1x4x1x1xf16>) outs(%26 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%extracted_slice_6 = tensor.extract_slice %arg11[0, 0, 0, 0, %24, %23] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> to tensor<1x1x1x1x4x1xf32>
%27 = iree_gpu.multi_mma %transposed_3, %transposed_5, %extracted_slice_6 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %27 into %arg11[0, 0, 0, 0, %24, %23] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x1xf32> into tensor<1x1x1x1x16x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg9[%arg5, 0, %arg7, 0, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> into tensor<2x1x2x1x16x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
scf.yield %21 : tensor<2x1x2x1x16x16xf32>
}
%14 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%13 : tensor<2x1x2x1x16x16xf32>) outs(%14 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%15 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%16 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%17 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%18 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %17) shared_outs(%arg7 = %16) -> (tensor<2x1x17x?xf32>) {
%19 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%20 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %19] [1, 1, 1, %20] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %19] [1, 1, 1, %20] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %21 into %arg7[%arg3, 0, %arg5, %19] [1, 1, 1, %20] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After GPUFuseAndHoistParallelLoopsPass (iree-codegen-gpu-fuse-and-hoist-parallel-loops) //----- //
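// [annotation, not part of the compiler dump] Relative to the previous dump, the subgroup/lane scf.forall loops are hoisted above the K loop (scf.for %c0 to %c721), so each lane now carries its 1x1x1x1x4x1 accumulator across K iterations instead of re-tiling per step. The cooperative loads of the 2x1x32x16 and 16x16 tiles are rewritten as iree_gpu.barrier_region ops over bufferization.alloc_tensor allocations in #gpu.address_space<workgroup>, with the per-thread work expressed as single-trip scf.for loops tagged {unroll_loop}.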
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c256 = arith.constant 256 : index
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = linalg.fill ins(%cst_0 : f32) outs(%20 : tensor<1x1x1x1x4x1xf32>) -> tensor<1x1x1x1x4x1xf32>
%22 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%25 = iree_gpu.barrier_region ins(%9 : tensor<2x1x32x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>):
%30 = scf.for %arg13 = %c0 to %c256 step %c256 iter_args(%arg14 = %arg12) -> (tensor<2x1x32x16xf16>) {
%31 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (d0 + d1 + d2 * 64 + d3 * 64 + d4 * 128 + d5 * 128)>(%arg13, %arg8, %c0, %arg5, %arg3, %c0)
%32:4 = affine.delinearize_index %31 into (2, 1, 32, 4) : index, index, index, index
%33 = affine.apply affine_map<(d0) -> (d0 * 4)>(%32#3)
%34 = affine.min affine_map<(d0) -> (2, d0)>(%32#0)
%35 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = affine.min affine_map<(d0) -> (17, d0)>(%32#2)
%38 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = arith.ori %39, %36 : i1
%41 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%38)
%42 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %32#3)
%43 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%42)
%44 = arith.cmpi eq, %43, %c0 : index
%45 = arith.ori %44, %40 : i1
%46 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%43)
%47 = scf.if %45 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg15: index, %arg16: index, %arg17: index, %arg18: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_8 = tensor.extract_slice %3[%34, 0, 0, 0] [%35, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%49 = tensor.empty(%35, %38, %43) : tensor<?x1x?x?xf16>
%50 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %37] * [17, 1] k_offset = [%42] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_8 : tensor<?x35x35x1281xf16>) outs(%49 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %50 low[0, 0, 0, 0] high[0, 0, %41, %46] {
^bb0(%arg15: index, %arg16: index, %arg17: index, %arg18: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_7 = tensor.extract_slice %arg14[%32#0, 0, %32#2, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%48 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%47 : tensor<1x1x1x4xf16>) outs(%extracted_slice_7 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %48 into %arg14[%32#0, 0, %32#2, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
scf.yield %inserted_slice : tensor<2x1x32x16xf16>
} {unroll_loop}
iree_gpu.yield %30 : tensor<2x1x32x16xf16>
} : tensor<2x1x32x16xf16>
%26 = iree_gpu.barrier_region ins(%10 : tensor<16x16xf16>) {
^bb0(%arg12: tensor<16x16xf16>):
%30 = scf.for %arg13 = %c0 to %c256 step %c256 iter_args(%arg14 = %arg12) -> (tensor<16x16xf16>) {
%31 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (d0 + d1 + d2 * 64 + d3 * 64 + d4 * 128 + d5 * 128)>(%arg13, %arg8, %c0, %arg5, %arg3, %c0)
%32:2 = affine.delinearize_index %31 into (16, 16) : index, index
%extracted_slice_7 = tensor.extract_slice %arg14[%32#0, %32#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%33 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%32#0]
%34 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%33)
%35 = arith.cmpi eq, %34, %c0 : index
%36 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%34)
%37 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%32#1, %arg1)
%38 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = arith.ori %39, %35 : i1
%41 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%38)
%42 = scf.if %40 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%44 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%37]
%extracted_slice_8 = tensor.extract_slice %5[%33, %44] [%34, %38] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_8 low[0, 0] high[%36, %41] {
^bb0(%arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%43 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%42 : tensor<1x1xf16>) outs(%extracted_slice_7 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice = tensor.insert_slice %43 into %arg14[%32#0, %32#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
scf.yield %inserted_slice : tensor<16x16xf16>
} {unroll_loop}
iree_gpu.yield %30 : tensor<16x16xf16>
} : tensor<16x16xf16>
%expanded = tensor.expand_shape %25 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_2 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%27 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%27 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%expanded_4 = tensor.expand_shape %26 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%28 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_6 = linalg.transpose ins(%extracted_slice_5 : tensor<1x4x1x1xf16>) outs(%28 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%29 = iree_gpu.multi_mma %transposed_3, %transposed_6, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.yield %29 : tensor<1x1x1x1x4x1xf32>
}
%23 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%24 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %23, 0, %24] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%transposed = linalg.transpose ins(%22 : tensor<1x1x1x1x4x1xf32>) outs(%extracted_slice_1 : tensor<1x1x1x4x1x1xf32>) permutation = [0, 1, 2, 4, 3, 5]
scf.forall.in_parallel {
tensor.parallel_insert_slice %transposed into %arg9[0, 0, 0, %23, 0, %24] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After GPUGreedilyDistributeToThreadsPass (iree-codegen-gpu-greedily-distribute-to-threads) //----- //
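// [annotation, not part of the compiler dump] No visible change relative to the previous dump; the remaining ops already appear to be distributed to threads, so this pass is a no-op here.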
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c256 = arith.constant 256 : index
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = linalg.fill ins(%cst_0 : f32) outs(%20 : tensor<1x1x1x1x4x1xf32>) -> tensor<1x1x1x1x4x1xf32>
%22 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%25 = iree_gpu.barrier_region ins(%9 : tensor<2x1x32x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>):
%30 = scf.for %arg13 = %c0 to %c256 step %c256 iter_args(%arg14 = %arg12) -> (tensor<2x1x32x16xf16>) {
%31 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (d0 + d1 + d2 * 64 + d3 * 64 + d4 * 128 + d5 * 128)>(%arg13, %arg8, %c0, %arg5, %arg3, %c0)
%32:4 = affine.delinearize_index %31 into (2, 1, 32, 4) : index, index, index, index
%33 = affine.apply affine_map<(d0) -> (d0 * 4)>(%32#3)
%34 = affine.min affine_map<(d0) -> (2, d0)>(%32#0)
%35 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = affine.min affine_map<(d0) -> (17, d0)>(%32#2)
%38 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = arith.ori %39, %36 : i1
%41 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%38)
%42 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %32#3)
%43 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%42)
%44 = arith.cmpi eq, %43, %c0 : index
%45 = arith.ori %44, %40 : i1
%46 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%43)
%47 = scf.if %45 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg15: index, %arg16: index, %arg17: index, %arg18: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_8 = tensor.extract_slice %3[%34, 0, 0, 0] [%35, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%49 = tensor.empty(%35, %38, %43) : tensor<?x1x?x?xf16>
%50 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %37] * [17, 1] k_offset = [%42] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_8 : tensor<?x35x35x1281xf16>) outs(%49 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %50 low[0, 0, 0, 0] high[0, 0, %41, %46] {
^bb0(%arg15: index, %arg16: index, %arg17: index, %arg18: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_7 = tensor.extract_slice %arg14[%32#0, 0, %32#2, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%48 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%47 : tensor<1x1x1x4xf16>) outs(%extracted_slice_7 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %48 into %arg14[%32#0, 0, %32#2, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
scf.yield %inserted_slice : tensor<2x1x32x16xf16>
} {unroll_loop}
iree_gpu.yield %30 : tensor<2x1x32x16xf16>
} : tensor<2x1x32x16xf16>
%26 = iree_gpu.barrier_region ins(%10 : tensor<16x16xf16>) {
^bb0(%arg12: tensor<16x16xf16>):
%30 = scf.for %arg13 = %c0 to %c256 step %c256 iter_args(%arg14 = %arg12) -> (tensor<16x16xf16>) {
%31 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (d0 + d1 + d2 * 64 + d3 * 64 + d4 * 128 + d5 * 128)>(%arg13, %arg8, %c0, %arg5, %arg3, %c0)
%32:2 = affine.delinearize_index %31 into (16, 16) : index, index
%extracted_slice_7 = tensor.extract_slice %arg14[%32#0, %32#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%33 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%32#0]
%34 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%33)
%35 = arith.cmpi eq, %34, %c0 : index
%36 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%34)
%37 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%32#1, %arg1)
%38 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = arith.ori %39, %35 : i1
%41 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%38)
%42 = scf.if %40 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%44 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%37]
%extracted_slice_8 = tensor.extract_slice %5[%33, %44] [%34, %38] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_8 low[0, 0] high[%36, %41] {
^bb0(%arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%43 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%42 : tensor<1x1xf16>) outs(%extracted_slice_7 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice = tensor.insert_slice %43 into %arg14[%32#0, %32#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
scf.yield %inserted_slice : tensor<16x16xf16>
} {unroll_loop}
iree_gpu.yield %30 : tensor<16x16xf16>
} : tensor<16x16xf16>
%expanded = tensor.expand_shape %25 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_2 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%27 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%27 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%expanded_4 = tensor.expand_shape %26 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%28 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_6 = linalg.transpose ins(%extracted_slice_5 : tensor<1x4x1x1xf16>) outs(%28 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%29 = iree_gpu.multi_mma %transposed_3, %transposed_6, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.yield %29 : tensor<1x1x1x1x4x1xf32>
}
%23 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%24 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %23, 0, %24] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%transposed = linalg.transpose ins(%22 : tensor<1x1x1x1x4x1xf32>) outs(%extracted_slice_1 : tensor<1x1x1x4x1x1xf32>) permutation = [0, 1, 2, 4, 3, 5]
scf.forall.in_parallel {
tensor.parallel_insert_slice %transposed into %arg9[0, 0, 0, %23, 0, %24] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After TileLargeTensorsPass (iree-codegen-tile-large-tensors) //----- //
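// [annotation, not part of the compiler dump] The only change relative to the previous dump is at the end of the function: the thread-level linalg.copy of the dynamic tensor<1x1x1x?xf32> output slice is tiled into nested unit-step scf.for loops that copy 1x1x1x1 slices. The redundant arith.constant ops introduced by that tiling are cleaned up by the canonicalizer in the next dump.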
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c256 = arith.constant 256 : index
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = linalg.fill ins(%cst_0 : f32) outs(%20 : tensor<1x1x1x1x4x1xf32>) -> tensor<1x1x1x1x4x1xf32>
%22 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%25 = iree_gpu.barrier_region ins(%9 : tensor<2x1x32x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>):
%30 = scf.for %arg13 = %c0 to %c256 step %c256 iter_args(%arg14 = %arg12) -> (tensor<2x1x32x16xf16>) {
%31 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (d0 + d1 + d2 * 64 + d3 * 64 + d4 * 128 + d5 * 128)>(%arg13, %arg8, %c0, %arg5, %arg3, %c0)
%32:4 = affine.delinearize_index %31 into (2, 1, 32, 4) : index, index, index, index
%33 = affine.apply affine_map<(d0) -> (d0 * 4)>(%32#3)
%34 = affine.min affine_map<(d0) -> (2, d0)>(%32#0)
%35 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = affine.min affine_map<(d0) -> (17, d0)>(%32#2)
%38 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = arith.ori %39, %36 : i1
%41 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%38)
%42 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %32#3)
%43 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%42)
%44 = arith.cmpi eq, %43, %c0 : index
%45 = arith.ori %44, %40 : i1
%46 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%43)
%47 = scf.if %45 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg15: index, %arg16: index, %arg17: index, %arg18: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_8 = tensor.extract_slice %3[%34, 0, 0, 0] [%35, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%49 = tensor.empty(%35, %38, %43) : tensor<?x1x?x?xf16>
%50 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %37] * [17, 1] k_offset = [%42] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_8 : tensor<?x35x35x1281xf16>) outs(%49 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %50 low[0, 0, 0, 0] high[0, 0, %41, %46] {
^bb0(%arg15: index, %arg16: index, %arg17: index, %arg18: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_7 = tensor.extract_slice %arg14[%32#0, 0, %32#2, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%48 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%47 : tensor<1x1x1x4xf16>) outs(%extracted_slice_7 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %48 into %arg14[%32#0, 0, %32#2, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
scf.yield %inserted_slice : tensor<2x1x32x16xf16>
} {unroll_loop}
iree_gpu.yield %30 : tensor<2x1x32x16xf16>
} : tensor<2x1x32x16xf16>
%26 = iree_gpu.barrier_region ins(%10 : tensor<16x16xf16>) {
^bb0(%arg12: tensor<16x16xf16>):
%30 = scf.for %arg13 = %c0 to %c256 step %c256 iter_args(%arg14 = %arg12) -> (tensor<16x16xf16>) {
%31 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (d0 + d1 + d2 * 64 + d3 * 64 + d4 * 128 + d5 * 128)>(%arg13, %arg8, %c0, %arg5, %arg3, %c0)
%32:2 = affine.delinearize_index %31 into (16, 16) : index, index
%extracted_slice_7 = tensor.extract_slice %arg14[%32#0, %32#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%33 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%32#0]
%34 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%33)
%35 = arith.cmpi eq, %34, %c0 : index
%36 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%34)
%37 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%32#1, %arg1)
%38 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = arith.ori %39, %35 : i1
%41 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%38)
%42 = scf.if %40 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%44 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%37]
%extracted_slice_8 = tensor.extract_slice %5[%33, %44] [%34, %38] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_8 low[0, 0] high[%36, %41] {
^bb0(%arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%43 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%42 : tensor<1x1xf16>) outs(%extracted_slice_7 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice = tensor.insert_slice %43 into %arg14[%32#0, %32#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
scf.yield %inserted_slice : tensor<16x16xf16>
} {unroll_loop}
iree_gpu.yield %30 : tensor<16x16xf16>
} : tensor<16x16xf16>
%expanded = tensor.expand_shape %25 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_2 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%27 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%27 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%expanded_4 = tensor.expand_shape %26 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%28 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_6 = linalg.transpose ins(%extracted_slice_5 : tensor<1x4x1x1xf16>) outs(%28 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%29 = iree_gpu.multi_mma %transposed_3, %transposed_6, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.yield %29 : tensor<1x1x1x1x4x1xf32>
}
%23 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%24 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %23, 0, %24] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%transposed = linalg.transpose ins(%22 : tensor<1x1x1x1x4x1xf32>) outs(%extracted_slice_1 : tensor<1x1x1x4x1x1xf32>) permutation = [0, 1, 2, 4, 3, 5]
scf.forall.in_parallel {
tensor.parallel_insert_slice %transposed into %arg9[0, 0, 0, %23, 0, %24] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%c3 = arith.constant 3 : index
%c3_2 = arith.constant 3 : index
%c0_3 = arith.constant 0 : index
%c0_4 = arith.constant 0 : index
%c0_5 = arith.constant 0 : index
%c0_6 = arith.constant 0 : index
%c1_7 = arith.constant 1 : index
%c1_8 = arith.constant 1 : index
%c1_9 = arith.constant 1 : index
%c1_10 = arith.constant 1 : index
%c1_11 = arith.constant 1 : index
%c1_12 = arith.constant 1 : index
%c1_13 = arith.constant 1 : index
%19 = scf.for %arg8 = %c0_3 to %c1_7 step %c1_10 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%20 = scf.for %arg10 = %c0_4 to %c1_8 step %c1_11 iter_args(%arg11 = %arg9) -> (tensor<1x1x1x?xf32>) {
%21 = scf.for %arg12 = %c0_5 to %c1_9 step %c1_12 iter_args(%arg13 = %arg11) -> (tensor<1x1x1x?xf32>) {
%22 = scf.for %arg14 = %c0_6 to %18 step %c1_13 iter_args(%arg15 = %arg13) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_14 = tensor.extract_slice %extracted_slice[%arg8, %arg10, %arg12, %arg14] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_15 = tensor.extract_slice %arg15[%arg8, %arg10, %arg12, %arg14] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%23 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_14 : tensor<1x1x1x1xf32>) outs(%extracted_slice_15 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %23 into %arg15[%arg8, %arg10, %arg12, %arg14] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.yield %22 : tensor<1x1x1x?xf32>
}
scf.yield %21 : tensor<1x1x1x?xf32>
}
scf.yield %20 : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
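// [annotation, not part of the compiler dump] Canonicalization drops the dead constants, folds the single-iteration scf.for loops inside the iree_gpu.barrier_region bodies (step 256 over a 256-wide range) into straight-line code, and simplifies the thread-index affine maps and affine.delinearize_index shapes (e.g. (2, 1, 32, 4) becomes (2, 32, 4)).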
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = linalg.fill ins(%cst_0 : f32) outs(%20 : tensor<1x1x1x1x4x1xf32>) -> tensor<1x1x1x1x4x1xf32>
%22 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%25 = iree_gpu.barrier_region ins(%9 : tensor<2x1x32x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>):
%30 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%31:3 = affine.delinearize_index %30 into (2, 32, 4) : index, index, index
%32 = affine.apply affine_map<(d0) -> (d0 * 4)>(%31#2)
%33 = affine.min affine_map<(d0) -> (2, d0)>(%31#0)
%34 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%33)
%35 = arith.cmpi eq, %34, %c0 : index
%36 = affine.min affine_map<(d0) -> (17, d0)>(%31#1)
%37 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%36)
%38 = arith.cmpi eq, %37, %c0 : index
%39 = arith.ori %38, %35 : i1
%40 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%37)
%41 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %31#2)
%42 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%41)
%43 = arith.cmpi eq, %42, %c0 : index
%44 = arith.ori %43, %39 : i1
%45 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%42)
%46 = scf.if %44 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg13: index, %arg14: index, %arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_8 = tensor.extract_slice %3[%33, 0, 0, 0] [%34, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%48 = tensor.empty(%34, %37, %42) : tensor<?x1x?x?xf16>
%49 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %36] * [17, 1] k_offset = [%41] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_8 : tensor<?x35x35x1281xf16>) outs(%48 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %49 low[0, 0, 0, 0] high[0, 0, %40, %45] {
^bb0(%arg13: index, %arg14: index, %arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_7 = tensor.extract_slice %arg12[%31#0, 0, %31#1, %32] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%47 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%46 : tensor<1x1x1x4xf16>) outs(%extracted_slice_7 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %47 into %arg12[%31#0, 0, %31#1, %32] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
iree_gpu.yield %inserted_slice : tensor<2x1x32x16xf16>
} : tensor<2x1x32x16xf16>
%26 = iree_gpu.barrier_region ins(%10 : tensor<16x16xf16>) {
^bb0(%arg12: tensor<16x16xf16>):
%30 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%31:2 = affine.delinearize_index %30 into (16, 16) : index, index
%extracted_slice_7 = tensor.extract_slice %arg12[%31#0, %31#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%32 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%31#0]
%33 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%32)
%34 = arith.cmpi eq, %33, %c0 : index
%35 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%33)
%36 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%31#1, %arg1)
%37 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %36)
%38 = arith.cmpi eq, %37, %c0 : index
%39 = arith.ori %38, %34 : i1
%40 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%37)
%41 = scf.if %39 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg13: index, %arg14: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%43 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%36]
%extracted_slice_8 = tensor.extract_slice %5[%32, %43] [%33, %37] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_8 low[0, 0] high[%35, %40] {
^bb0(%arg13: index, %arg14: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%42 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%41 : tensor<1x1xf16>) outs(%extracted_slice_7 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice = tensor.insert_slice %42 into %arg12[%31#0, %31#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice : tensor<16x16xf16>
} : tensor<16x16xf16>
%expanded = tensor.expand_shape %25 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_2 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%27 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%27 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%expanded_4 = tensor.expand_shape %26 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%28 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_6 = linalg.transpose ins(%extracted_slice_5 : tensor<1x4x1x1xf16>) outs(%28 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%29 = iree_gpu.multi_mma %transposed_3, %transposed_6, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.yield %29 : tensor<1x1x1x1x4x1xf32>
}
%23 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%24 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %23, 0, %24] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%transposed = linalg.transpose ins(%22 : tensor<1x1x1x1x4x1xf32>) outs(%extracted_slice_1 : tensor<1x1x1x4x1x1xf32>) permutation = [0, 1, 2, 4, 3, 5]
scf.forall.in_parallel {
tensor.parallel_insert_slice %transposed into %arg9[0, 0, 0, %23, 0, %24] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
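// Note (annotation, not compiler output): compared with the preceding dump, CSE reuses the
// existing lane-id index computations (%18 = d0 mod 16, %19 = the floordiv-based offset) for the
// epilogue extract_slice / parallel_insert_slice instead of recomputing the same affine.apply
// ops after the scf.for reduction loop; the rest of the kernel is unchanged.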
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = linalg.fill ins(%cst_0 : f32) outs(%20 : tensor<1x1x1x1x4x1xf32>) -> tensor<1x1x1x1x4x1xf32>
%22 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%23 = iree_gpu.barrier_region ins(%9 : tensor<2x1x32x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>):
%28 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%29:3 = affine.delinearize_index %28 into (2, 32, 4) : index, index, index
%30 = affine.apply affine_map<(d0) -> (d0 * 4)>(%29#2)
%31 = affine.min affine_map<(d0) -> (2, d0)>(%29#0)
%32 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = affine.min affine_map<(d0) -> (17, d0)>(%29#1)
%35 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %36, %33 : i1
%38 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%35)
%39 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %29#2)
%40 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%39)
%41 = arith.cmpi eq, %40, %c0 : index
%42 = arith.ori %41, %37 : i1
%43 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%40)
%44 = scf.if %42 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg13: index, %arg14: index, %arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_8 = tensor.extract_slice %3[%31, 0, 0, 0] [%32, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%46 = tensor.empty(%32, %35, %40) : tensor<?x1x?x?xf16>
%47 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %34] * [17, 1] k_offset = [%39] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_8 : tensor<?x35x35x1281xf16>) outs(%46 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %47 low[0, 0, 0, 0] high[0, 0, %38, %43] {
^bb0(%arg13: index, %arg14: index, %arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_7 = tensor.extract_slice %arg12[%29#0, 0, %29#1, %30] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%45 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%44 : tensor<1x1x1x4xf16>) outs(%extracted_slice_7 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %45 into %arg12[%29#0, 0, %29#1, %30] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
iree_gpu.yield %inserted_slice : tensor<2x1x32x16xf16>
} : tensor<2x1x32x16xf16>
%24 = iree_gpu.barrier_region ins(%10 : tensor<16x16xf16>) {
^bb0(%arg12: tensor<16x16xf16>):
%28 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%29:2 = affine.delinearize_index %28 into (16, 16) : index, index
%extracted_slice_7 = tensor.extract_slice %arg12[%29#0, %29#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%30 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%29#0]
%31 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%30)
%32 = arith.cmpi eq, %31, %c0 : index
%33 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%31)
%34 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%29#1, %arg1)
%35 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %36, %32 : i1
%38 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%35)
%39 = scf.if %37 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg13: index, %arg14: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%41 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%34]
%extracted_slice_8 = tensor.extract_slice %5[%30, %41] [%31, %35] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_8 low[0, 0] high[%33, %38] {
^bb0(%arg13: index, %arg14: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%40 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%39 : tensor<1x1xf16>) outs(%extracted_slice_7 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice = tensor.insert_slice %40 into %arg12[%29#0, %29#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice : tensor<16x16xf16>
} : tensor<16x16xf16>
%expanded = tensor.expand_shape %23 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_2 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%25 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%25 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%expanded_4 = tensor.expand_shape %24 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%26 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_6 = linalg.transpose ins(%extracted_slice_5 : tensor<1x4x1x1xf16>) outs(%26 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%27 = iree_gpu.multi_mma %transposed_3, %transposed_6, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.yield %27 : tensor<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%transposed = linalg.transpose ins(%22 : tensor<1x1x1x1x4x1xf32>) outs(%extracted_slice_1 : tensor<1x1x1x4x1x1xf32>) permutation = [0, 1, 2, 4, 3, 5]
scf.forall.in_parallel {
tensor.parallel_insert_slice %transposed into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After IREELoopInvariantCodeMotionPass (iree-loop-invariant-code-motion) //----- //
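// Note (annotation, not compiler output): loop-invariant code motion hoists the two tensor.empty
// ops used as transpose init operands (tensor<1x1x1x1x1x4xf16> and tensor<1x1x1x4xf16>) out of the
// scf.for reduction loop over the K tiles (%c0 to %c721 step %c1); the loop body is otherwise
// unchanged.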
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = linalg.fill ins(%cst_0 : f32) outs(%20 : tensor<1x1x1x1x4x1xf32>) -> tensor<1x1x1x1x4x1xf32>
%22 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%23 = tensor.empty() : tensor<1x1x1x4xf16>
%24 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%25 = iree_gpu.barrier_region ins(%9 : tensor<2x1x32x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>):
%28 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%29:3 = affine.delinearize_index %28 into (2, 32, 4) : index, index, index
%30 = affine.apply affine_map<(d0) -> (d0 * 4)>(%29#2)
%31 = affine.min affine_map<(d0) -> (2, d0)>(%29#0)
%32 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = affine.min affine_map<(d0) -> (17, d0)>(%29#1)
%35 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %36, %33 : i1
%38 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%35)
%39 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %29#2)
%40 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%39)
%41 = arith.cmpi eq, %40, %c0 : index
%42 = arith.ori %41, %37 : i1
%43 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%40)
%44 = scf.if %42 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg13: index, %arg14: index, %arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_8 = tensor.extract_slice %3[%31, 0, 0, 0] [%32, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%46 = tensor.empty(%32, %35, %40) : tensor<?x1x?x?xf16>
%47 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %34] * [17, 1] k_offset = [%39] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_8 : tensor<?x35x35x1281xf16>) outs(%46 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %47 low[0, 0, 0, 0] high[0, 0, %38, %43] {
^bb0(%arg13: index, %arg14: index, %arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_7 = tensor.extract_slice %arg12[%29#0, 0, %29#1, %30] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%45 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%44 : tensor<1x1x1x4xf16>) outs(%extracted_slice_7 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %45 into %arg12[%29#0, 0, %29#1, %30] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
iree_gpu.yield %inserted_slice : tensor<2x1x32x16xf16>
} : tensor<2x1x32x16xf16>
%26 = iree_gpu.barrier_region ins(%10 : tensor<16x16xf16>) {
^bb0(%arg12: tensor<16x16xf16>):
%28 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%29:2 = affine.delinearize_index %28 into (16, 16) : index, index
%extracted_slice_7 = tensor.extract_slice %arg12[%29#0, %29#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%30 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%29#0]
%31 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%30)
%32 = arith.cmpi eq, %31, %c0 : index
%33 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%31)
%34 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%29#1, %arg1)
%35 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %36, %32 : i1
%38 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%35)
%39 = scf.if %37 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg13: index, %arg14: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%41 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%34]
%extracted_slice_8 = tensor.extract_slice %5[%30, %41] [%31, %35] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_8 low[0, 0] high[%33, %38] {
^bb0(%arg13: index, %arg14: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%40 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%39 : tensor<1x1xf16>) outs(%extracted_slice_7 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice = tensor.insert_slice %40 into %arg12[%29#0, %29#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice : tensor<16x16xf16>
} : tensor<16x16xf16>
%expanded = tensor.expand_shape %25 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_2 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%22 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%expanded_4 = tensor.expand_shape %26 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%transposed_6 = linalg.transpose ins(%extracted_slice_5 : tensor<1x4x1x1xf16>) outs(%23 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%27 = iree_gpu.multi_mma %transposed_3, %transposed_6, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.yield %27 : tensor<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%transposed = linalg.transpose ins(%24 : tensor<1x1x1x1x4x1xf32>) outs(%extracted_slice_1 : tensor<1x1x1x4x1x1xf32>) permutation = [0, 1, 2, 4, 3, 5]
scf.forall.in_parallel {
tensor.parallel_insert_slice %transposed into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After CombineBarrierRegionsPass (iree-gpu-combine-barrier-regions) //----- //
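// Note (annotation, not compiler output): the two separate iree_gpu.barrier_region ops that staged
// the im2col input tile (tensor<2x1x32x16xf16>) and the filter tile (tensor<16x16xf16>) into
// workgroup memory are merged into a single barrier_region with two operands and two results, so
// one barrier now covers both shared-memory copies per K iteration.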
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = linalg.fill ins(%cst_0 : f32) outs(%20 : tensor<1x1x1x1x4x1xf32>) -> tensor<1x1x1x1x4x1xf32>
%22 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%23 = tensor.empty() : tensor<1x1x1x4xf16>
%24 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%25:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%27 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%28:3 = affine.delinearize_index %27 into (2, 32, 4) : index, index, index
%29 = affine.apply affine_map<(d0) -> (d0 * 4)>(%28#2)
%30 = affine.min affine_map<(d0) -> (2, d0)>(%28#0)
%31 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%30)
%32 = arith.cmpi eq, %31, %c0 : index
%33 = affine.min affine_map<(d0) -> (17, d0)>(%28#1)
%34 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%33)
%35 = arith.cmpi eq, %34, %c0 : index
%36 = arith.ori %35, %32 : i1
%37 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%34)
%38 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %28#2)
%39 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%38)
%40 = arith.cmpi eq, %39, %c0 : index
%41 = arith.ori %40, %36 : i1
%42 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%39)
%43 = scf.if %41 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_10 = tensor.extract_slice %3[%30, 0, 0, 0] [%31, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%58 = tensor.empty(%31, %34, %39) : tensor<?x1x?x?xf16>
%59 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %33] * [17, 1] k_offset = [%38] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_10 : tensor<?x35x35x1281xf16>) outs(%58 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %59 low[0, 0, 0, 0] high[0, 0, %37, %42] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_7 = tensor.extract_slice %arg12[%28#0, 0, %28#1, %29] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%44 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%43 : tensor<1x1x1x4xf16>) outs(%extracted_slice_7 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %44 into %arg12[%28#0, 0, %28#1, %29] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%45 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%46:2 = affine.delinearize_index %45 into (16, 16) : index, index
%extracted_slice_8 = tensor.extract_slice %arg13[%46#0, %46#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%47 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%46#0]
%48 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%47)
%49 = arith.cmpi eq, %48, %c0 : index
%50 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%48)
%51 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%46#1, %arg1)
%52 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %51)
%53 = arith.cmpi eq, %52, %c0 : index
%54 = arith.ori %53, %49 : i1
%55 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%52)
%56 = scf.if %54 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%58 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%51]
%extracted_slice_10 = tensor.extract_slice %5[%47, %58] [%48, %52] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_10 low[0, 0] high[%50, %55] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%57 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%56 : tensor<1x1xf16>) outs(%extracted_slice_8 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_9 = tensor.insert_slice %57 into %arg13[%46#0, %46#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_9 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %25#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_2 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%22 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%expanded_4 = tensor.expand_shape %25#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%transposed_6 = linalg.transpose ins(%extracted_slice_5 : tensor<1x4x1x1xf16>) outs(%23 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%26 = iree_gpu.multi_mma %transposed_3, %transposed_6, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.yield %26 : tensor<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%transposed = linalg.transpose ins(%24 : tensor<1x1x1x1x4x1xf32>) outs(%extracted_slice_1 : tensor<1x1x1x4x1x1xf32>) permutation = [0, 1, 2, 4, 3, 5]
scf.forall.in_parallel {
tensor.parallel_insert_slice %transposed into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After VectorizeIREEGPUOpsPass (iree-gpu-vectorize-ops) //----- //
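// Note (annotation, not compiler output): the iree_gpu.multi_mma contraction now operates on
// vectors inside the K loop: its operands are loaded with vector.transfer_read from the transposed
// tiles and the accumulator tensor, and the vector<1x1x1x1x4x1xf32> result is written back with
// vector.transfer_write; the tensor-level lowering_config attribute is dropped from the op.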
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = linalg.fill ins(%cst_0 : f32) outs(%20 : tensor<1x1x1x1x4x1xf32>) -> tensor<1x1x1x1x4x1xf32>
%22 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%23 = tensor.empty() : tensor<1x1x1x4xf16>
%24 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%25:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%31 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%32:3 = affine.delinearize_index %31 into (2, 32, 4) : index, index, index
%33 = affine.apply affine_map<(d0) -> (d0 * 4)>(%32#2)
%34 = affine.min affine_map<(d0) -> (2, d0)>(%32#0)
%35 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = affine.min affine_map<(d0) -> (17, d0)>(%32#1)
%38 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = arith.ori %39, %36 : i1
%41 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%38)
%42 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %32#2)
%43 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%42)
%44 = arith.cmpi eq, %43, %c0 : index
%45 = arith.ori %44, %40 : i1
%46 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%43)
%47 = scf.if %45 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_10 = tensor.extract_slice %3[%34, 0, 0, 0] [%35, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%62 = tensor.empty(%35, %38, %43) : tensor<?x1x?x?xf16>
%63 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %37] * [17, 1] k_offset = [%42] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_10 : tensor<?x35x35x1281xf16>) outs(%62 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %63 low[0, 0, 0, 0] high[0, 0, %41, %46] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_7 = tensor.extract_slice %arg12[%32#0, 0, %32#1, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%48 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%47 : tensor<1x1x1x4xf16>) outs(%extracted_slice_7 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %48 into %arg12[%32#0, 0, %32#1, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%49 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%50:2 = affine.delinearize_index %49 into (16, 16) : index, index
%extracted_slice_8 = tensor.extract_slice %arg13[%50#0, %50#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%51 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%50#0]
%52 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%51)
%53 = arith.cmpi eq, %52, %c0 : index
%54 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%52)
%55 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%50#1, %arg1)
%56 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %55)
%57 = arith.cmpi eq, %56, %c0 : index
%58 = arith.ori %57, %53 : i1
%59 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%56)
%60 = scf.if %58 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%62 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%55]
%extracted_slice_10 = tensor.extract_slice %5[%51, %62] [%52, %56] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_10 low[0, 0] high[%54, %59] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%61 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%60 : tensor<1x1xf16>) outs(%extracted_slice_8 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_9 = tensor.insert_slice %61 into %arg13[%50#0, %50#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_9 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %25#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_2 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%22 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%expanded_4 = tensor.expand_shape %25#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%transposed_6 = linalg.transpose ins(%extracted_slice_5 : tensor<1x4x1x1xf16>) outs(%23 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%26 = vector.transfer_read %transposed_3[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x1x4xf16>, vector<1x1x1x1x1x4xf16>
%27 = vector.transfer_read %transposed_6[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x1x4xf16>, vector<1x1x1x4xf16>
%28 = vector.transfer_read %arg11[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x1xf32>, vector<1x1x1x1x4x1xf32>
%29 = iree_gpu.multi_mma %26, %27, %28 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
%30 = vector.transfer_write %29, %arg11[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x1xf32>, tensor<1x1x1x1x4x1xf32>
scf.yield %30 : tensor<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%transposed = linalg.transpose ins(%24 : tensor<1x1x1x1x4x1xf32>) outs(%extracted_slice_1 : tensor<1x1x1x4x1x1xf32>) permutation = [0, 1, 2, 4, 3, 5]
scf.forall.in_parallel {
tensor.parallel_insert_slice %transposed into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After DecomposeConvolutionToLowerDimOpsPass (iree-codegen-decompose-convolution-to-lower-dim-ops) //----- //
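// Note: the function below appears unchanged from the preceding dump; the
// convolution was already rewritten into iree_linalg_ext.im2col + matmul form
// (use_igemm_convolution = true), so there is no named convolution op left for
// this pass to decompose.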
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = linalg.fill ins(%cst_0 : f32) outs(%20 : tensor<1x1x1x1x4x1xf32>) -> tensor<1x1x1x1x4x1xf32>
%22 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%23 = tensor.empty() : tensor<1x1x1x4xf16>
%24 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%25:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%31 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%32:3 = affine.delinearize_index %31 into (2, 32, 4) : index, index, index
%33 = affine.apply affine_map<(d0) -> (d0 * 4)>(%32#2)
%34 = affine.min affine_map<(d0) -> (2, d0)>(%32#0)
%35 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = affine.min affine_map<(d0) -> (17, d0)>(%32#1)
%38 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = arith.ori %39, %36 : i1
%41 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%38)
%42 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %32#2)
%43 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%42)
%44 = arith.cmpi eq, %43, %c0 : index
%45 = arith.ori %44, %40 : i1
%46 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%43)
%47 = scf.if %45 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_10 = tensor.extract_slice %3[%34, 0, 0, 0] [%35, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%62 = tensor.empty(%35, %38, %43) : tensor<?x1x?x?xf16>
%63 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %37] * [17, 1] k_offset = [%42] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_10 : tensor<?x35x35x1281xf16>) outs(%62 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %63 low[0, 0, 0, 0] high[0, 0, %41, %46] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_7 = tensor.extract_slice %arg12[%32#0, 0, %32#1, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%48 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%47 : tensor<1x1x1x4xf16>) outs(%extracted_slice_7 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %48 into %arg12[%32#0, 0, %32#1, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%49 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%50:2 = affine.delinearize_index %49 into (16, 16) : index, index
%extracted_slice_8 = tensor.extract_slice %arg13[%50#0, %50#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%51 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%50#0]
%52 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%51)
%53 = arith.cmpi eq, %52, %c0 : index
%54 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%52)
%55 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%50#1, %arg1)
%56 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %55)
%57 = arith.cmpi eq, %56, %c0 : index
%58 = arith.ori %57, %53 : i1
%59 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%56)
%60 = scf.if %58 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%62 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%55]
%extracted_slice_10 = tensor.extract_slice %5[%51, %62] [%52, %56] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_10 low[0, 0] high[%54, %59] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%61 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%60 : tensor<1x1xf16>) outs(%extracted_slice_8 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_9 = tensor.insert_slice %61 into %arg13[%50#0, %50#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_9 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %25#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_2 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%22 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%expanded_4 = tensor.expand_shape %25#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%transposed_6 = linalg.transpose ins(%extracted_slice_5 : tensor<1x4x1x1xf16>) outs(%23 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%26 = vector.transfer_read %transposed_3[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x1x4xf16>, vector<1x1x1x1x1x4xf16>
%27 = vector.transfer_read %transposed_6[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x1x4xf16>, vector<1x1x1x4xf16>
%28 = vector.transfer_read %arg11[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x1xf32>, vector<1x1x1x1x4x1xf32>
%29 = iree_gpu.multi_mma %26, %27, %28 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
%30 = vector.transfer_write %29, %arg11[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x1xf32>, tensor<1x1x1x1x4x1xf32>
scf.yield %30 : tensor<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%transposed = linalg.transpose ins(%24 : tensor<1x1x1x1x4x1xf32>) outs(%extracted_slice_1 : tensor<1x1x1x4x1x1xf32>) permutation = [0, 1, 2, 4, 3, 5]
scf.forall.in_parallel {
tensor.parallel_insert_slice %transposed into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After DecomposeIm2colPass (iree-linalg-ext-decompose-im2col) //----- //
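// Note: this pass decomposes the iree_linalg_ext.im2col op from the previous
// dump into a nest of scf.for loops over the dynamic batch, row, and K-tile
// extents. The affine.apply ops in the innermost loop recover the source
// coordinates of the 2x35x35x1281 input from the flattened im2col indices;
// with kernel_size = [3, 3], strides = [2, 2], and 1281 input channels
// (so K = 3 * 3 * 1281 = 11529 and 3843 = 3 * 1281), the maps amount to:
//   c  = k mod 1281
//   kh = k floordiv 3843
//   kw = (k mod 3843) floordiv 1281
//   h  = oh * 2 + kh,   w = ow * 2 + kw     (oh, ow recovered from the m index)
// Each element is then copied individually into the ?x1x?x? gather tile.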
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = linalg.fill ins(%cst_0 : f32) outs(%20 : tensor<1x1x1x1x4x1xf32>) -> tensor<1x1x1x1x4x1xf32>
%22 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%23 = tensor.empty() : tensor<1x1x1x4xf16>
%24 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%25:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%31 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%32:3 = affine.delinearize_index %31 into (2, 32, 4) : index, index, index
%33 = affine.apply affine_map<(d0) -> (d0 * 4)>(%32#2)
%34 = affine.min affine_map<(d0) -> (2, d0)>(%32#0)
%35 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = affine.min affine_map<(d0) -> (17, d0)>(%32#1)
%38 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = arith.ori %39, %36 : i1
%41 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%38)
%42 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %32#2)
%43 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%42)
%44 = arith.cmpi eq, %43, %c0 : index
%45 = arith.ori %44, %40 : i1
%46 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%43)
%47 = scf.if %45 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_10 = tensor.extract_slice %3[%34, 0, 0, 0] [%35, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%62 = tensor.empty(%35, %38, %43) : tensor<?x1x?x?xf16>
%63 = scf.for %arg14 = %c0 to %35 step %c1 iter_args(%arg15 = %62) -> (tensor<?x1x?x?xf16>) {
%64 = scf.for %arg16 = %c0 to %c1 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%65 = scf.for %arg18 = %c0 to %38 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%66 = scf.for %arg20 = %c0 to %43 step %c1 iter_args(%arg21 = %arg19) -> (tensor<?x1x?x?xf16>) {
%67 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%42, %arg20)
%68 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (((d0 * 17 + d1 + d2 * 17 + d3) floordiv 17) * 2 + (d4 + d5) floordiv 3843)>(%arg16, %arg18, %arg0, %37, %42, %arg20)
%69 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (d0 * 34 + d1 * 2 + d2 * 34 + d3 * 2 - ((d0 * 17 + d1 + d2 * 17 + d3) floordiv 17) * 34 + ((d4 + d5) mod 3843) floordiv 1281)>(%arg16, %arg18, %arg0, %37, %42, %arg20)
%extracted_slice_11 = tensor.extract_slice %extracted_slice_10[%arg14, %68, %69, %67] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_12 = tensor.extract_slice %arg21[%arg14, %arg16, %arg18, %arg20] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%70 = linalg.copy ins(%extracted_slice_11 : tensor<1x1x1x1xf16>) outs(%extracted_slice_12 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_13 = tensor.insert_slice %70 into %arg21[%arg14, %arg16, %arg18, %arg20] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_13 : tensor<?x1x?x?xf16>
}
scf.yield %66 : tensor<?x1x?x?xf16>
}
scf.yield %65 : tensor<?x1x?x?xf16>
}
scf.yield %64 : tensor<?x1x?x?xf16>
}
%padded = tensor.pad %63 low[0, 0, 0, 0] high[0, 0, %41, %46] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_7 = tensor.extract_slice %arg12[%32#0, 0, %32#1, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%48 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%47 : tensor<1x1x1x4xf16>) outs(%extracted_slice_7 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %48 into %arg12[%32#0, 0, %32#1, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%49 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%50:2 = affine.delinearize_index %49 into (16, 16) : index, index
%extracted_slice_8 = tensor.extract_slice %arg13[%50#0, %50#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%51 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%50#0]
%52 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%51)
%53 = arith.cmpi eq, %52, %c0 : index
%54 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%52)
%55 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%50#1, %arg1)
%56 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %55)
%57 = arith.cmpi eq, %56, %c0 : index
%58 = arith.ori %57, %53 : i1
%59 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%56)
%60 = scf.if %58 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%62 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%55]
%extracted_slice_10 = tensor.extract_slice %5[%51, %62] [%52, %56] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_10 low[0, 0] high[%54, %59] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%61 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%60 : tensor<1x1xf16>) outs(%extracted_slice_8 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_9 = tensor.insert_slice %61 into %arg13[%50#0, %50#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_9 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %25#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_2 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%22 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%expanded_4 = tensor.expand_shape %25#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%transposed_6 = linalg.transpose ins(%extracted_slice_5 : tensor<1x4x1x1xf16>) outs(%23 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%26 = vector.transfer_read %transposed_3[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x1x4xf16>, vector<1x1x1x1x1x4xf16>
%27 = vector.transfer_read %transposed_6[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x1x4xf16>, vector<1x1x1x4xf16>
%28 = vector.transfer_read %arg11[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x1xf32>, vector<1x1x1x1x4x1xf32>
%29 = iree_gpu.multi_mma %26, %27, %28 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
%30 = vector.transfer_write %29, %arg11[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x1xf32>, tensor<1x1x1x1x4x1xf32>
scf.yield %30 : tensor<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%transposed = linalg.transpose ins(%24 : tensor<1x1x1x1x4x1xf32>) outs(%extracted_slice_1 : tensor<1x1x1x4x1x1xf32>) permutation = [0, 1, 2, 4, 3, 5]
scf.forall.in_parallel {
tensor.parallel_insert_slice %transposed into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After VectorizeIREEVectorExtOpsPass (iree-vector-ext-vectorize-ops) //----- //
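// Note: this dispatch contains no iree_vector_ext ops, so the IR below appears
// identical to the previous dump.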
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = linalg.fill ins(%cst_0 : f32) outs(%20 : tensor<1x1x1x1x4x1xf32>) -> tensor<1x1x1x1x4x1xf32>
%22 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%23 = tensor.empty() : tensor<1x1x1x4xf16>
%24 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%25:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%31 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%32:3 = affine.delinearize_index %31 into (2, 32, 4) : index, index, index
%33 = affine.apply affine_map<(d0) -> (d0 * 4)>(%32#2)
%34 = affine.min affine_map<(d0) -> (2, d0)>(%32#0)
%35 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = affine.min affine_map<(d0) -> (17, d0)>(%32#1)
%38 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = arith.ori %39, %36 : i1
%41 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%38)
%42 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %32#2)
%43 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%42)
%44 = arith.cmpi eq, %43, %c0 : index
%45 = arith.ori %44, %40 : i1
%46 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%43)
%47 = scf.if %45 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_10 = tensor.extract_slice %3[%34, 0, 0, 0] [%35, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%62 = tensor.empty(%35, %38, %43) : tensor<?x1x?x?xf16>
%63 = scf.for %arg14 = %c0 to %35 step %c1 iter_args(%arg15 = %62) -> (tensor<?x1x?x?xf16>) {
%64 = scf.for %arg16 = %c0 to %c1 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%65 = scf.for %arg18 = %c0 to %38 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%66 = scf.for %arg20 = %c0 to %43 step %c1 iter_args(%arg21 = %arg19) -> (tensor<?x1x?x?xf16>) {
%67 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%42, %arg20)
%68 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (((d0 * 17 + d1 + d2 * 17 + d3) floordiv 17) * 2 + (d4 + d5) floordiv 3843)>(%arg16, %arg18, %arg0, %37, %42, %arg20)
%69 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (d0 * 34 + d1 * 2 + d2 * 34 + d3 * 2 - ((d0 * 17 + d1 + d2 * 17 + d3) floordiv 17) * 34 + ((d4 + d5) mod 3843) floordiv 1281)>(%arg16, %arg18, %arg0, %37, %42, %arg20)
%extracted_slice_11 = tensor.extract_slice %extracted_slice_10[%arg14, %68, %69, %67] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_12 = tensor.extract_slice %arg21[%arg14, %arg16, %arg18, %arg20] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%70 = linalg.copy ins(%extracted_slice_11 : tensor<1x1x1x1xf16>) outs(%extracted_slice_12 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_13 = tensor.insert_slice %70 into %arg21[%arg14, %arg16, %arg18, %arg20] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_13 : tensor<?x1x?x?xf16>
}
scf.yield %66 : tensor<?x1x?x?xf16>
}
scf.yield %65 : tensor<?x1x?x?xf16>
}
scf.yield %64 : tensor<?x1x?x?xf16>
}
%padded = tensor.pad %63 low[0, 0, 0, 0] high[0, 0, %41, %46] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_7 = tensor.extract_slice %arg12[%32#0, 0, %32#1, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%48 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%47 : tensor<1x1x1x4xf16>) outs(%extracted_slice_7 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %48 into %arg12[%32#0, 0, %32#1, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%49 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%50:2 = affine.delinearize_index %49 into (16, 16) : index, index
%extracted_slice_8 = tensor.extract_slice %arg13[%50#0, %50#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%51 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%50#0]
%52 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%51)
%53 = arith.cmpi eq, %52, %c0 : index
%54 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%52)
%55 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%50#1, %arg1)
%56 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %55)
%57 = arith.cmpi eq, %56, %c0 : index
%58 = arith.ori %57, %53 : i1
%59 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%56)
%60 = scf.if %58 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%62 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%55]
%extracted_slice_10 = tensor.extract_slice %5[%51, %62] [%52, %56] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_10 low[0, 0] high[%54, %59] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%61 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%60 : tensor<1x1xf16>) outs(%extracted_slice_8 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_9 = tensor.insert_slice %61 into %arg13[%50#0, %50#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_9 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %25#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_2 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%22 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%expanded_4 = tensor.expand_shape %25#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%transposed_6 = linalg.transpose ins(%extracted_slice_5 : tensor<1x4x1x1xf16>) outs(%23 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%26 = vector.transfer_read %transposed_3[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x1x4xf16>, vector<1x1x1x1x1x4xf16>
%27 = vector.transfer_read %transposed_6[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x1x4xf16>, vector<1x1x1x4xf16>
%28 = vector.transfer_read %arg11[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x1xf32>, vector<1x1x1x1x4x1xf32>
%29 = iree_gpu.multi_mma %26, %27, %28 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
%30 = vector.transfer_write %29, %arg11[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x1xf32>, tensor<1x1x1x1x4x1xf32>
scf.yield %30 : tensor<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%transposed = linalg.transpose ins(%24 : tensor<1x1x1x1x4x1xf32>) outs(%extracted_slice_1 : tensor<1x1x1x4x1x1xf32>) permutation = [0, 1, 2, 4, 3, 5]
scf.forall.in_parallel {
tensor.parallel_insert_slice %transposed into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- //
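// Note: compared with the previous dump, generic vectorization rewrites the
// linalg.fill of the accumulator into a vector.transfer_write of a
// dense<0.0> vector<1x1x1x1x4x1xf32>, and the linalg.transpose ops around the
// iree_gpu.multi_mma into vector.transfer_read / vector.transpose /
// vector.transfer_write sequences. The small linalg.copy ops (including those
// tagged #iree_gpu.derived_thread_config) remain un-vectorized here.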
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%cst_1 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = vector.transfer_write %cst, %20[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x1xf32>, tensor<1x1x1x1x4x1xf32>
%22 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%26:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%34 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%35:3 = affine.delinearize_index %34 into (2, 32, 4) : index, index, index
%36 = affine.apply affine_map<(d0) -> (d0 * 4)>(%35#2)
%37 = affine.min affine_map<(d0) -> (2, d0)>(%35#0)
%38 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = affine.min affine_map<(d0) -> (17, d0)>(%35#1)
%41 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%40)
%42 = arith.cmpi eq, %41, %c0 : index
%43 = arith.ori %42, %39 : i1
%44 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%41)
%45 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %35#2)
%46 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%45)
%47 = arith.cmpi eq, %46, %c0 : index
%48 = arith.ori %47, %43 : i1
%49 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%46)
%50 = scf.if %48 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_9 = tensor.extract_slice %3[%37, 0, 0, 0] [%38, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%65 = tensor.empty(%38, %41, %46) : tensor<?x1x?x?xf16>
%66 = scf.for %arg14 = %c0 to %38 step %c1 iter_args(%arg15 = %65) -> (tensor<?x1x?x?xf16>) {
%67 = scf.for %arg16 = %c0 to %c1 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%68 = scf.for %arg18 = %c0 to %41 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%69 = scf.for %arg20 = %c0 to %46 step %c1 iter_args(%arg21 = %arg19) -> (tensor<?x1x?x?xf16>) {
%70 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%45, %arg20)
%71 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (((d0 * 17 + d1 + d2 * 17 + d3) floordiv 17) * 2 + (d4 + d5) floordiv 3843)>(%arg16, %arg18, %arg0, %40, %45, %arg20)
%72 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (d0 * 34 + d1 * 2 + d2 * 34 + d3 * 2 - ((d0 * 17 + d1 + d2 * 17 + d3) floordiv 17) * 34 + ((d4 + d5) mod 3843) floordiv 1281)>(%arg16, %arg18, %arg0, %40, %45, %arg20)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_9[%arg14, %71, %72, %70] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_11 = tensor.extract_slice %arg21[%arg14, %arg16, %arg18, %arg20] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%73 = linalg.copy ins(%extracted_slice_10 : tensor<1x1x1x1xf16>) outs(%extracted_slice_11 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_12 = tensor.insert_slice %73 into %arg21[%arg14, %arg16, %arg18, %arg20] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_12 : tensor<?x1x?x?xf16>
}
scf.yield %69 : tensor<?x1x?x?xf16>
}
scf.yield %68 : tensor<?x1x?x?xf16>
}
scf.yield %67 : tensor<?x1x?x?xf16>
}
%padded = tensor.pad %66 low[0, 0, 0, 0] high[0, 0, %44, %49] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_6 = tensor.extract_slice %arg12[%35#0, 0, %35#1, %36] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%51 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%50 : tensor<1x1x1x4xf16>) outs(%extracted_slice_6 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %51 into %arg12[%35#0, 0, %35#1, %36] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%52 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%53:2 = affine.delinearize_index %52 into (16, 16) : index, index
%extracted_slice_7 = tensor.extract_slice %arg13[%53#0, %53#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%54 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%53#0]
%55 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%54)
%56 = arith.cmpi eq, %55, %c0 : index
%57 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%55)
%58 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%53#1, %arg1)
%59 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %58)
%60 = arith.cmpi eq, %59, %c0 : index
%61 = arith.ori %60, %56 : i1
%62 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%59)
%63 = scf.if %61 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%65 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%58]
%extracted_slice_9 = tensor.extract_slice %5[%54, %65] [%55, %59] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_9 low[0, 0] high[%57, %62] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%64 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%63 : tensor<1x1xf16>) outs(%extracted_slice_7 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_8 = tensor.insert_slice %64 into %arg13[%53#0, %53#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_8 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %26#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_3 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%27 = vector.transfer_read %extracted_slice_3[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x1x4xf16>, vector<1x1x1x1x1x4xf16>
%28 = vector.transpose %27, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expanded_4 = tensor.expand_shape %26#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%29 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x4x1x1xf16>, vector<1x4x1x1xf16>
%30 = vector.transpose %29, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%31 = vector.transfer_read %arg11[%c0, %c0, %c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x1xf32>, vector<1x1x1x1x4x1xf32>
%32 = iree_gpu.multi_mma %28, %30, %31 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
%33 = vector.transfer_write %32, %arg11[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x1xf32>, tensor<1x1x1x1x4x1xf32>
scf.yield %33 : tensor<1x1x1x1x4x1xf32>
}
%extracted_slice_2 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%23 = vector.transfer_read %22[%c0, %c0, %c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x1xf32>, vector<1x1x1x1x4x1xf32>
%24 = vector.transpose %23, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%25 = vector.transfer_write %24, %extracted_slice_2[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, tensor<1x1x1x4x1x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %25 into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_2 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_2) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_4 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x1x1xf32>) outs(%extracted_slice_4 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
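// The inner copy loops above reconstruct the im2col indexing of the stride-2 convolution:
// the flattened reduction index (size 11529 = 3 * 3 * 1281) is split back into (kh, kw, c)
// by the `floordiv 3843` / `floordiv 1281` / `mod 1281` terms, and the output pixel is mapped
// to the input pixel it reads via the `* 2` stride factors. A minimal Python sketch of that
// decomposition (function names and the example values are illustrative only, not from the IR):

C = 1281           # input channels
KW = 3             # kernel width
STRIDE = 2

def decompose_k(k):
    """Split a flattened IGEMM reduction index k in [0, 3*3*1281) into (kh, kw, c)."""
    kh = k // (KW * C)            # matches the `... floordiv 3843` term
    kw = (k % (KW * C)) // C      # matches the `(... mod 3843) floordiv 1281` term
    c = k % C                     # matches the `... mod 1281` term
    return kh, kw, c

def input_coords(out_h, out_w, k):
    """Input pixel read for output pixel (out_h, out_w) at reduction index k."""
    kh, kw, c = decompose_k(k)
    return out_h * STRIDE + kh, out_w * STRIDE + kw, c

print(decompose_k(5000))         # (1, 0, 1157)
print(input_coords(1, 2, 5000))  # (3, 4, 1157)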
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%cst_1 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = vector.transfer_write %cst, %20[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x1xf32>, tensor<1x1x1x1x4x1xf32>
%22 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%26:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%34 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%35:3 = affine.delinearize_index %34 into (2, 32, 4) : index, index, index
%36 = affine.apply affine_map<(d0) -> (d0 * 4)>(%35#2)
%37 = affine.min affine_map<(d0) -> (2, d0)>(%35#0)
%38 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = affine.min affine_map<(d0) -> (17, d0)>(%35#1)
%41 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%40)
%42 = arith.cmpi eq, %41, %c0 : index
%43 = arith.ori %42, %39 : i1
%44 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%41)
%45 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %35#2)
%46 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%45)
%47 = arith.cmpi eq, %46, %c0 : index
%48 = arith.ori %47, %43 : i1
%49 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%46)
%50 = scf.if %48 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_9 = tensor.extract_slice %3[%37, 0, 0, 0] [%38, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%65 = tensor.empty(%38, %41, %46) : tensor<?x1x?x?xf16>
%66 = scf.for %arg14 = %c0 to %38 step %c1 iter_args(%arg15 = %65) -> (tensor<?x1x?x?xf16>) {
%67 = scf.for %arg16 = %c0 to %41 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%68 = scf.for %arg18 = %c0 to %46 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%69 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%45, %arg18)
%70 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg16, %arg0, %40, %45, %arg18)
%71 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg16, %arg0, %40, %45, %arg18)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_9[%arg14, %70, %71, %69] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_11 = tensor.extract_slice %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%72 = linalg.copy ins(%extracted_slice_10 : tensor<1x1x1x1xf16>) outs(%extracted_slice_11 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_12 = tensor.insert_slice %72 into %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_12 : tensor<?x1x?x?xf16>
}
scf.yield %68 : tensor<?x1x?x?xf16>
}
scf.yield %67 : tensor<?x1x?x?xf16>
}
%padded = tensor.pad %66 low[0, 0, 0, 0] high[0, 0, %44, %49] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_6 = tensor.extract_slice %arg12[%35#0, 0, %35#1, %36] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%51 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%50 : tensor<1x1x1x4xf16>) outs(%extracted_slice_6 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %51 into %arg12[%35#0, 0, %35#1, %36] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%52 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%53:2 = affine.delinearize_index %52 into (16, 16) : index, index
%extracted_slice_7 = tensor.extract_slice %arg13[%53#0, %53#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%54 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%53#0]
%55 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%54)
%56 = arith.cmpi eq, %55, %c0 : index
%57 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%55)
%58 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%53#1, %arg1)
%59 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %58)
%60 = arith.cmpi eq, %59, %c0 : index
%61 = arith.ori %60, %56 : i1
%62 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%59)
%63 = scf.if %61 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%65 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%58]
%extracted_slice_9 = tensor.extract_slice %5[%54, %65] [%55, %59] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_9 low[0, 0] high[%57, %62] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%64 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%63 : tensor<1x1xf16>) outs(%extracted_slice_7 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_8 = tensor.insert_slice %64 into %arg13[%53#0, %53#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_8 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %26#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_3 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%27 = vector.transfer_read %extracted_slice_3[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x1x4xf16>, vector<1x1x1x1x1x4xf16>
%28 = vector.transpose %27, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expanded_4 = tensor.expand_shape %26#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%29 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x4x1x1xf16>, vector<1x4x1x1xf16>
%30 = vector.transpose %29, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%31 = vector.transfer_read %arg11[%c0, %c0, %c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x1xf32>, vector<1x1x1x1x4x1xf32>
%32 = iree_gpu.multi_mma %28, %30, %31 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
%33 = vector.transfer_write %32, %arg11[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x1xf32>, tensor<1x1x1x1x4x1xf32>
scf.yield %33 : tensor<1x1x1x1x4x1xf32>
}
%extracted_slice_2 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%23 = vector.transfer_read %22[%c0, %c0, %c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x1xf32>, vector<1x1x1x1x4x1xf32>
%24 = vector.transpose %23, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%25 = vector.transfer_write %24, %extracted_slice_2[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, tensor<1x1x1x4x1x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %25 into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_2 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_2) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_4 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x1x1xf32>) outs(%extracted_slice_4 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
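// The two per-lane maps that recur in each dump, `d0 mod 16` and
// `(d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16`, pick the element of a 16x16
// MFMA fragment a lane touches: the first is the column within the tile, and the second is an
// expanded form of `4 * ((d0 floordiv 16) mod 4)`, a 4-element offset along the other tile
// dimension. A small Python check of that equivalence over a 64-lane subgroup (illustrative
// only; the function names are not from the IR):

def lane_col(lane):
    # affine_map<(d0) -> (d0 mod 16)>
    return lane % 16

def lane_offset(lane):
    # affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>
    return (lane // 16) * 4 - ((lane // 16) // 4) * 16

for lane in range(64):
    assert lane_offset(lane) == 4 * ((lane // 16) % 4)
    assert 0 <= lane_col(lane) < 16 and lane_offset(lane) in (0, 4, 8, 12)
print("64 lanes -> 16 columns x offsets {0, 4, 8, 12}")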
// -----// IR Dump After CSE (cse) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%cst_1 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = vector.transfer_write %cst, %20[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x1xf32>, tensor<1x1x1x1x4x1xf32>
%22 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%26:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%34 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%35:3 = affine.delinearize_index %34 into (2, 32, 4) : index, index, index
%36 = affine.apply affine_map<(d0) -> (d0 * 4)>(%35#2)
%37 = affine.min affine_map<(d0) -> (2, d0)>(%35#0)
%38 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = affine.min affine_map<(d0) -> (17, d0)>(%35#1)
%41 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%40)
%42 = arith.cmpi eq, %41, %c0 : index
%43 = arith.ori %42, %39 : i1
%44 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%41)
%45 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %35#2)
%46 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%45)
%47 = arith.cmpi eq, %46, %c0 : index
%48 = arith.ori %47, %43 : i1
%49 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%46)
%50 = scf.if %48 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_9 = tensor.extract_slice %3[%37, 0, 0, 0] [%38, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%64 = tensor.empty(%38, %41, %46) : tensor<?x1x?x?xf16>
%65 = scf.for %arg14 = %c0 to %38 step %c1 iter_args(%arg15 = %64) -> (tensor<?x1x?x?xf16>) {
%66 = scf.for %arg16 = %c0 to %41 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%67 = scf.for %arg18 = %c0 to %46 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%68 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%45, %arg18)
%69 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg16, %arg0, %40, %45, %arg18)
%70 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg16, %arg0, %40, %45, %arg18)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_9[%arg14, %69, %70, %68] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_11 = tensor.extract_slice %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%71 = linalg.copy ins(%extracted_slice_10 : tensor<1x1x1x1xf16>) outs(%extracted_slice_11 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_12 = tensor.insert_slice %71 into %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_12 : tensor<?x1x?x?xf16>
}
scf.yield %67 : tensor<?x1x?x?xf16>
}
scf.yield %66 : tensor<?x1x?x?xf16>
}
%padded = tensor.pad %65 low[0, 0, 0, 0] high[0, 0, %44, %49] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_6 = tensor.extract_slice %arg12[%35#0, 0, %35#1, %36] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%51 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%50 : tensor<1x1x1x4xf16>) outs(%extracted_slice_6 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %51 into %arg12[%35#0, 0, %35#1, %36] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%52:2 = affine.delinearize_index %34 into (16, 16) : index, index
%extracted_slice_7 = tensor.extract_slice %arg13[%52#0, %52#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%53 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%52#0]
%54 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%53)
%55 = arith.cmpi eq, %54, %c0 : index
%56 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%54)
%57 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%52#1, %arg1)
%58 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %57)
%59 = arith.cmpi eq, %58, %c0 : index
%60 = arith.ori %59, %55 : i1
%61 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%58)
%62 = scf.if %60 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%64 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%57]
%extracted_slice_9 = tensor.extract_slice %5[%53, %64] [%54, %58] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_9 low[0, 0] high[%56, %61] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%63 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%62 : tensor<1x1xf16>) outs(%extracted_slice_7 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_8 = tensor.insert_slice %63 into %arg13[%52#0, %52#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_8 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %26#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_3 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%27 = vector.transfer_read %extracted_slice_3[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x1x4xf16>, vector<1x1x1x1x1x4xf16>
%28 = vector.transpose %27, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expanded_4 = tensor.expand_shape %26#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%29 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x4x1x1xf16>, vector<1x4x1x1xf16>
%30 = vector.transpose %29, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%31 = vector.transfer_read %arg11[%c0, %c0, %c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x1xf32>, vector<1x1x1x1x4x1xf32>
%32 = iree_gpu.multi_mma %28, %30, %31 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
%33 = vector.transfer_write %32, %arg11[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x1xf32>, tensor<1x1x1x1x4x1xf32>
scf.yield %33 : tensor<1x1x1x1x4x1xf32>
}
%extracted_slice_2 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%23 = vector.transfer_read %22[%c0, %c0, %c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x1xf32>, vector<1x1x1x1x4x1xf32>
%24 = vector.transpose %23, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%25 = vector.transfer_write %24, %extracted_slice_2[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, tensor<1x1x1x4x1x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %25 into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_2 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_2) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_4 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x1x1xf32>) outs(%extracted_slice_4 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
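// Relative to the canonicalized dump above, the visible effect of CSE is that the linear
// thread index (%34 = d0 + d1 * 64 + d2 * 128 over the lane and warp ids) is computed once and
// fed to both affine.delinearize_index ops, instead of being recomputed before the second one.
// Those two delinearizations give each thread its slot in the two shared-memory staging
// tensors. A hypothetical Python equivalent of the two bases (illustrative only, not IREE code):

def delinearize(i, basis):
    """Mimic affine.delinearize_index for an in-range index: mixed-radix digits of i."""
    out = []
    for b in reversed(basis):
        out.append(i % b)
        i //= b
    return tuple(reversed(out))

tid = 1 + 0 * 64 + 1 * 128             # e.g. lane 1, second pair of warp indices
print(delinearize(tid, (2, 32, 4)))    # slot in the 2x1x32x16 staging tensor -> (1, 0, 1)
print(delinearize(tid, (16, 16)))      # slot in the 16x16 staging tensor     -> (8, 1)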
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
%23:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%29 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%30:3 = affine.delinearize_index %29 into (2, 32, 4) : index, index, index
%31 = affine.apply affine_map<(d0) -> (d0 * 4)>(%30#2)
%32 = affine.min affine_map<(d0) -> (2, d0)>(%30#0)
%33 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%32)
%34 = arith.cmpi eq, %33, %c0 : index
%35 = affine.min affine_map<(d0) -> (17, d0)>(%30#1)
%36 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%35)
%37 = arith.cmpi eq, %36, %c0 : index
%38 = arith.ori %37, %34 : i1
%39 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%36)
%40 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %30#2)
%41 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%40)
%42 = arith.cmpi eq, %41, %c0 : index
%43 = arith.ori %42, %38 : i1
%44 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%41)
%45 = scf.if %43 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_6 = tensor.extract_slice %3[%32, 0, 0, 0] [%33, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%59 = tensor.empty(%33, %36, %41) : tensor<?x1x?x?xf16>
%60 = scf.for %arg14 = %c0 to %33 step %c1 iter_args(%arg15 = %59) -> (tensor<?x1x?x?xf16>) {
%61 = scf.for %arg16 = %c0 to %36 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%62 = scf.for %arg18 = %c0 to %41 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%63 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%40, %arg18)
%64 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg16, %arg0, %35, %40, %arg18)
%65 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg16, %arg0, %35, %40, %arg18)
%extracted_slice_7 = tensor.extract_slice %extracted_slice_6[%arg14, %64, %65, %63] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_8 = tensor.extract_slice %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%66 = linalg.copy ins(%extracted_slice_7 : tensor<1x1x1x1xf16>) outs(%extracted_slice_8 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_9 = tensor.insert_slice %66 into %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_9 : tensor<?x1x?x?xf16>
}
scf.yield %62 : tensor<?x1x?x?xf16>
}
scf.yield %61 : tensor<?x1x?x?xf16>
}
%padded = tensor.pad %60 low[0, 0, 0, 0] high[0, 0, %39, %44] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_3 = tensor.extract_slice %arg12[%30#0, 0, %30#1, %31] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%46 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%45 : tensor<1x1x1x4xf16>) outs(%extracted_slice_3 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %46 into %arg12[%30#0, 0, %30#1, %31] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%47:2 = affine.delinearize_index %29 into (16, 16) : index, index
%extracted_slice_4 = tensor.extract_slice %arg13[%47#0, %47#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%48 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%47#0]
%49 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%48)
%50 = arith.cmpi eq, %49, %c0 : index
%51 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%49)
%52 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%47#1, %arg1)
%53 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %52)
%54 = arith.cmpi eq, %53, %c0 : index
%55 = arith.ori %54, %50 : i1
%56 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%53)
%57 = scf.if %55 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%59 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%52]
%extracted_slice_6 = tensor.extract_slice %5[%48, %59] [%49, %53] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_6 low[0, 0] high[%51, %56] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%58 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%57 : tensor<1x1xf16>) outs(%extracted_slice_4 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_5 = tensor.insert_slice %58 into %arg13[%47#0, %47#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_5 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %23#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%24 = vector.transfer_read %expanded[%arg3, %c0, %arg5, %18, %c0, %19], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<2x1x2x16x1x16xf16>, vector<1x1x1x1x1x4xf16>
%25 = vector.transpose %24, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expanded_2 = tensor.expand_shape %23#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%26 = vector.transfer_read %expanded_2[%c0, %19, %c0, %18], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x16x1x16xf16>, vector<1x4x1x1xf16>
%27 = vector.transpose %26, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%28 = iree_gpu.multi_mma %25, %27, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %28 : vector<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%21 = vector.transpose %20, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%22 = vector.transfer_write %21, %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, tensor<1x1x1x4x1x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x1x1xf32>) outs(%extracted_slice_2 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
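// Compared with the CSE dump above, the visible effect of
// iree-codegen-optimize-tensor-insert-extract-slices is that the accumulator no longer
// round-trips through a tensor: the scf.for over the 721 reduction steps now carries
// vector<1x1x1x1x4x1xf32> directly as its iter_arg (seeded from the zero constant), the reads
// of the expanded shared tiles no longer go through per-thread extract_slices, and a single
// transpose + transfer_write remains after the loop. A rough Python sketch of that shape of
// rewrite (illustrative only; `mma`, `a`, `b`, and `acc_buf` are stand-ins, not IREE APIs):

K = 4
a = [1.0] * K
b = [2.0] * K
acc_buf = [0.0]              # plays the role of the tensor the accumulator lived in

def mma(x, y, acc):          # stand-in for the per-step multiply-accumulate
    return acc + x * y

# Before: the running value round-trips through the buffer on every step.
acc_buf[0] = 0.0
for k in range(K):
    acc_buf[0] = mma(a[k], b[k], acc_buf[0])

# After: the running value is loop-carried; one store after the loop.
acc = 0.0
for k in range(K):
    acc = mma(a[k], b[k], acc)
acc_buf[0] = acc

print(acc_buf[0])            # 8.0 either way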
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
%23:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%29 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%30:3 = affine.delinearize_index %29 into (2, 32, 4) : index, index, index
%31 = affine.apply affine_map<(d0) -> (d0 * 4)>(%30#2)
%32 = affine.min affine_map<(d0) -> (2, d0)>(%30#0)
%33 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%32)
%34 = arith.cmpi eq, %33, %c0 : index
%35 = affine.min affine_map<(d0) -> (17, d0)>(%30#1)
%36 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%35)
%37 = arith.cmpi eq, %36, %c0 : index
%38 = arith.ori %37, %34 : i1
%39 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%36)
%40 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %30#2)
%41 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%40)
%42 = arith.cmpi eq, %41, %c0 : index
%43 = arith.ori %42, %38 : i1
%44 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%41)
%45 = scf.if %43 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_6 = tensor.extract_slice %3[%32, 0, 0, 0] [%33, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%59 = tensor.empty(%33, %36, %41) : tensor<?x1x?x?xf16>
%60 = scf.for %arg14 = %c0 to %33 step %c1 iter_args(%arg15 = %59) -> (tensor<?x1x?x?xf16>) {
%61 = scf.for %arg16 = %c0 to %36 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%62 = scf.for %arg18 = %c0 to %41 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%63 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%40, %arg18)
%64 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg16, %arg0, %35, %40, %arg18)
%65 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg16, %arg0, %35, %40, %arg18)
%extracted_slice_7 = tensor.extract_slice %extracted_slice_6[%arg14, %64, %65, %63] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_8 = tensor.extract_slice %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%66 = linalg.copy ins(%extracted_slice_7 : tensor<1x1x1x1xf16>) outs(%extracted_slice_8 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_9 = tensor.insert_slice %66 into %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_9 : tensor<?x1x?x?xf16>
}
scf.yield %62 : tensor<?x1x?x?xf16>
}
scf.yield %61 : tensor<?x1x?x?xf16>
}
%padded = tensor.pad %60 low[0, 0, 0, 0] high[0, 0, %39, %44] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_3 = tensor.extract_slice %arg12[%30#0, 0, %30#1, %31] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%46 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%45 : tensor<1x1x1x4xf16>) outs(%extracted_slice_3 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %46 into %arg12[%30#0, 0, %30#1, %31] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%47:2 = affine.delinearize_index %29 into (16, 16) : index, index
%extracted_slice_4 = tensor.extract_slice %arg13[%47#0, %47#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%48 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%47#0]
%49 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%48)
%50 = arith.cmpi eq, %49, %c0 : index
%51 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%49)
%52 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%47#1, %arg1)
%53 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %52)
%54 = arith.cmpi eq, %53, %c0 : index
%55 = arith.ori %54, %50 : i1
%56 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%53)
%57 = scf.if %55 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%59 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%52]
%extracted_slice_6 = tensor.extract_slice %5[%48, %59] [%49, %53] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_6 low[0, 0] high[%51, %56] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%58 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%57 : tensor<1x1xf16>) outs(%extracted_slice_4 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_5 = tensor.insert_slice %58 into %arg13[%47#0, %47#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_5 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %23#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%24 = vector.transfer_read %expanded[%arg3, %c0, %arg5, %18, %c0, %19], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<2x1x2x16x1x16xf16>, vector<1x1x1x1x1x4xf16>
%25 = vector.transpose %24, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expanded_2 = tensor.expand_shape %23#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%26 = vector.transfer_read %expanded_2[%c0, %19, %c0, %18], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x16x1x16xf16>, vector<1x4x1x1xf16>
%27 = vector.transpose %26, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%28 = iree_gpu.multi_mma %25, %27, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %28 : vector<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%21 = vector.transpose %20, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%22 = vector.transfer_write %21, %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, tensor<1x1x1x4x1x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x1x1xf32>) outs(%extracted_slice_2 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
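// (note: cse folds identical pure operations with identical operands into a single result; the function below is the IR after that deduplication.)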
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
%23:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%29 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%30:3 = affine.delinearize_index %29 into (2, 32, 4) : index, index, index
%31 = affine.apply affine_map<(d0) -> (d0 * 4)>(%30#2)
%32 = affine.min affine_map<(d0) -> (2, d0)>(%30#0)
%33 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%32)
%34 = arith.cmpi eq, %33, %c0 : index
%35 = affine.min affine_map<(d0) -> (17, d0)>(%30#1)
%36 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%35)
%37 = arith.cmpi eq, %36, %c0 : index
%38 = arith.ori %37, %34 : i1
%39 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%36)
%40 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %30#2)
%41 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%40)
%42 = arith.cmpi eq, %41, %c0 : index
%43 = arith.ori %42, %38 : i1
%44 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%41)
%45 = scf.if %43 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_6 = tensor.extract_slice %3[%32, 0, 0, 0] [%33, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%59 = tensor.empty(%33, %36, %41) : tensor<?x1x?x?xf16>
%60 = scf.for %arg14 = %c0 to %33 step %c1 iter_args(%arg15 = %59) -> (tensor<?x1x?x?xf16>) {
%61 = scf.for %arg16 = %c0 to %36 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%62 = scf.for %arg18 = %c0 to %41 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%63 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%40, %arg18)
%64 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg16, %arg0, %35, %40, %arg18)
%65 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg16, %arg0, %35, %40, %arg18)
%extracted_slice_7 = tensor.extract_slice %extracted_slice_6[%arg14, %64, %65, %63] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_8 = tensor.extract_slice %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%66 = linalg.copy ins(%extracted_slice_7 : tensor<1x1x1x1xf16>) outs(%extracted_slice_8 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_9 = tensor.insert_slice %66 into %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_9 : tensor<?x1x?x?xf16>
}
scf.yield %62 : tensor<?x1x?x?xf16>
}
scf.yield %61 : tensor<?x1x?x?xf16>
}
%padded = tensor.pad %60 low[0, 0, 0, 0] high[0, 0, %39, %44] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_3 = tensor.extract_slice %arg12[%30#0, 0, %30#1, %31] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%46 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%45 : tensor<1x1x1x4xf16>) outs(%extracted_slice_3 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %46 into %arg12[%30#0, 0, %30#1, %31] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%47:2 = affine.delinearize_index %29 into (16, 16) : index, index
%extracted_slice_4 = tensor.extract_slice %arg13[%47#0, %47#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%48 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%47#0]
%49 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%48)
%50 = arith.cmpi eq, %49, %c0 : index
%51 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%49)
%52 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%47#1, %arg1)
%53 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %52)
%54 = arith.cmpi eq, %53, %c0 : index
%55 = arith.ori %54, %50 : i1
%56 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%53)
%57 = scf.if %55 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%59 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%52]
%extracted_slice_6 = tensor.extract_slice %5[%48, %59] [%49, %53] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_6 low[0, 0] high[%51, %56] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%58 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%57 : tensor<1x1xf16>) outs(%extracted_slice_4 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_5 = tensor.insert_slice %58 into %arg13[%47#0, %47#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_5 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %23#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%24 = vector.transfer_read %expanded[%arg3, %c0, %arg5, %18, %c0, %19], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<2x1x2x16x1x16xf16>, vector<1x1x1x1x1x4xf16>
%25 = vector.transpose %24, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expanded_2 = tensor.expand_shape %23#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%26 = vector.transfer_read %expanded_2[%c0, %19, %c0, %18], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x16x1x16xf16>, vector<1x4x1x1xf16>
%27 = vector.transpose %26, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%28 = iree_gpu.multi_mma %25, %27, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %28 : vector<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%21 = vector.transpose %20, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%22 = vector.transfer_write %21, %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, tensor<1x1x1x4x1x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x1x1xf32>) outs(%extracted_slice_2 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
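// (note: this cleanup pass folds redundant buffer allocation/view patterns; in this dump it appears to make no change, and the function below matches the CSE output above.)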
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
%23:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%29 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%30:3 = affine.delinearize_index %29 into (2, 32, 4) : index, index, index
%31 = affine.apply affine_map<(d0) -> (d0 * 4)>(%30#2)
%32 = affine.min affine_map<(d0) -> (2, d0)>(%30#0)
%33 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%32)
%34 = arith.cmpi eq, %33, %c0 : index
%35 = affine.min affine_map<(d0) -> (17, d0)>(%30#1)
%36 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%35)
%37 = arith.cmpi eq, %36, %c0 : index
%38 = arith.ori %37, %34 : i1
%39 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%36)
%40 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %30#2)
%41 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%40)
%42 = arith.cmpi eq, %41, %c0 : index
%43 = arith.ori %42, %38 : i1
%44 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%41)
%45 = scf.if %43 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_6 = tensor.extract_slice %3[%32, 0, 0, 0] [%33, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%59 = tensor.empty(%33, %36, %41) : tensor<?x1x?x?xf16>
%60 = scf.for %arg14 = %c0 to %33 step %c1 iter_args(%arg15 = %59) -> (tensor<?x1x?x?xf16>) {
%61 = scf.for %arg16 = %c0 to %36 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%62 = scf.for %arg18 = %c0 to %41 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%63 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%40, %arg18)
%64 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg16, %arg0, %35, %40, %arg18)
%65 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg16, %arg0, %35, %40, %arg18)
%extracted_slice_7 = tensor.extract_slice %extracted_slice_6[%arg14, %64, %65, %63] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_8 = tensor.extract_slice %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%66 = linalg.copy ins(%extracted_slice_7 : tensor<1x1x1x1xf16>) outs(%extracted_slice_8 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_9 = tensor.insert_slice %66 into %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_9 : tensor<?x1x?x?xf16>
}
scf.yield %62 : tensor<?x1x?x?xf16>
}
scf.yield %61 : tensor<?x1x?x?xf16>
}
%padded = tensor.pad %60 low[0, 0, 0, 0] high[0, 0, %39, %44] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_3 = tensor.extract_slice %arg12[%30#0, 0, %30#1, %31] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%46 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%45 : tensor<1x1x1x4xf16>) outs(%extracted_slice_3 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %46 into %arg12[%30#0, 0, %30#1, %31] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%47:2 = affine.delinearize_index %29 into (16, 16) : index, index
%extracted_slice_4 = tensor.extract_slice %arg13[%47#0, %47#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%48 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%47#0]
%49 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%48)
%50 = arith.cmpi eq, %49, %c0 : index
%51 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%49)
%52 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%47#1, %arg1)
%53 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %52)
%54 = arith.cmpi eq, %53, %c0 : index
%55 = arith.ori %54, %50 : i1
%56 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%53)
%57 = scf.if %55 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%59 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%52]
%extracted_slice_6 = tensor.extract_slice %5[%48, %59] [%49, %53] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_6 low[0, 0] high[%51, %56] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%58 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%57 : tensor<1x1xf16>) outs(%extracted_slice_4 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_5 = tensor.insert_slice %58 into %arg13[%47#0, %47#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_5 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %23#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%24 = vector.transfer_read %expanded[%arg3, %c0, %arg5, %18, %c0, %19], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<2x1x2x16x1x16xf16>, vector<1x1x1x1x1x4xf16>
%25 = vector.transpose %24, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expanded_2 = tensor.expand_shape %23#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%26 = vector.transfer_read %expanded_2[%c0, %19, %c0, %18], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x16x1x16xf16>, vector<1x4x1x1xf16>
%27 = vector.transpose %26, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%28 = iree_gpu.multi_mma %25, %27, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %28 : vector<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%21 = vector.transpose %20, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%22 = vector.transfer_write %21, %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, tensor<1x1x1x4x1x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x1x1xf32>) outs(%extracted_slice_2 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After GPUCombineValueBarriersPass (iree-codegen-gpu-combine-value-barriers) //----- //
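// (note: this pass merges adjacent value barriers; here the single iree_gpu.barrier_region already covers both shared-memory tensors, so the IR appears unchanged from the previous dump.)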
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
%23:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%29 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%30:3 = affine.delinearize_index %29 into (2, 32, 4) : index, index, index
%31 = affine.apply affine_map<(d0) -> (d0 * 4)>(%30#2)
%32 = affine.min affine_map<(d0) -> (2, d0)>(%30#0)
%33 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%32)
%34 = arith.cmpi eq, %33, %c0 : index
%35 = affine.min affine_map<(d0) -> (17, d0)>(%30#1)
%36 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%35)
%37 = arith.cmpi eq, %36, %c0 : index
%38 = arith.ori %37, %34 : i1
%39 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%36)
%40 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %30#2)
%41 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%40)
%42 = arith.cmpi eq, %41, %c0 : index
%43 = arith.ori %42, %38 : i1
%44 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%41)
%45 = scf.if %43 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_6 = tensor.extract_slice %3[%32, 0, 0, 0] [%33, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%59 = tensor.empty(%33, %36, %41) : tensor<?x1x?x?xf16>
%60 = scf.for %arg14 = %c0 to %33 step %c1 iter_args(%arg15 = %59) -> (tensor<?x1x?x?xf16>) {
%61 = scf.for %arg16 = %c0 to %36 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%62 = scf.for %arg18 = %c0 to %41 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%63 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%40, %arg18)
%64 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg16, %arg0, %35, %40, %arg18)
%65 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg16, %arg0, %35, %40, %arg18)
%extracted_slice_7 = tensor.extract_slice %extracted_slice_6[%arg14, %64, %65, %63] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_8 = tensor.extract_slice %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%66 = linalg.copy ins(%extracted_slice_7 : tensor<1x1x1x1xf16>) outs(%extracted_slice_8 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_9 = tensor.insert_slice %66 into %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_9 : tensor<?x1x?x?xf16>
}
scf.yield %62 : tensor<?x1x?x?xf16>
}
scf.yield %61 : tensor<?x1x?x?xf16>
}
%padded = tensor.pad %60 low[0, 0, 0, 0] high[0, 0, %39, %44] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_3 = tensor.extract_slice %arg12[%30#0, 0, %30#1, %31] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%46 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%45 : tensor<1x1x1x4xf16>) outs(%extracted_slice_3 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %46 into %arg12[%30#0, 0, %30#1, %31] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%47:2 = affine.delinearize_index %29 into (16, 16) : index, index
%extracted_slice_4 = tensor.extract_slice %arg13[%47#0, %47#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%48 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%47#0]
%49 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%48)
%50 = arith.cmpi eq, %49, %c0 : index
%51 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%49)
%52 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%47#1, %arg1)
%53 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %52)
%54 = arith.cmpi eq, %53, %c0 : index
%55 = arith.ori %54, %50 : i1
%56 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%53)
%57 = scf.if %55 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%59 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%52]
%extracted_slice_6 = tensor.extract_slice %5[%48, %59] [%49, %53] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_6 low[0, 0] high[%51, %56] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%58 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%57 : tensor<1x1xf16>) outs(%extracted_slice_4 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_5 = tensor.insert_slice %58 into %arg13[%47#0, %47#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_5 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %23#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%24 = vector.transfer_read %expanded[%arg3, %c0, %arg5, %18, %c0, %19], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<2x1x2x16x1x16xf16>, vector<1x1x1x1x1x4xf16>
%25 = vector.transpose %24, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expanded_2 = tensor.expand_shape %23#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%26 = vector.transfer_read %expanded_2[%c0, %19, %c0, %18], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x16x1x16xf16>, vector<1x4x1x1xf16>
%27 = vector.transpose %26, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%28 = iree_gpu.multi_mma %25, %27, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %28 : vector<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%21 = vector.transpose %20, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%22 = vector.transfer_write %21, %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, tensor<1x1x1x4x1x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x1x1xf32>) outs(%extracted_slice_2 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- //
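// (note: empty-tensor elimination rewrites the dispatch into destination-passing style; the result tensor.empty is replaced by a load of the writeonly output binding (%4) feeding shared_outs, and the tensor.pad / tensor.generate producers are rewritten as linalg.fill / linalg.generic plus tensor.insert_slice.)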
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>> -> tensor<2x17x17x1281xf32>
%5 = tensor.empty() : tensor<2x17x17x1281xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%7 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%8 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%9 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%11 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%12 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%13 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %12) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%18 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x16x1x16xf32>) {
%19 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%20 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%21 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
%24:2 = iree_gpu.barrier_region ins(%10, %11 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%30 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%31:3 = affine.delinearize_index %30 into (2, 32, 4) : index, index, index
%32 = affine.apply affine_map<(d0) -> (d0 * 4)>(%31#2)
%33 = affine.min affine_map<(d0) -> (2, d0)>(%31#0)
%34 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%33)
%35 = arith.cmpi eq, %34, %c0 : index
%36 = affine.min affine_map<(d0) -> (17, d0)>(%31#1)
%37 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%36)
%38 = arith.cmpi eq, %37, %c0 : index
%39 = arith.ori %38, %35 : i1
%40 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %31#2)
%41 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%40)
%42 = arith.cmpi eq, %41, %c0 : index
%43 = arith.ori %42, %39 : i1
%44 = scf.if %43 -> (tensor<1x1x1x4xf16>) {
%56 = tensor.empty() : tensor<1x1x1x4xf16>
%57 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%56 : tensor<1x1x1x4xf16>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
} -> tensor<1x1x1x4xf16>
scf.yield %57 : tensor<1x1x1x4xf16>
} else {
%extracted_slice_7 = tensor.extract_slice %3[%33, 0, 0, 0] [%34, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%56 = tensor.empty(%34, %37, %41) : tensor<?x1x?x?xf16>
%57 = scf.for %arg14 = %c0 to %34 step %c1 iter_args(%arg15 = %56) -> (tensor<?x1x?x?xf16>) {
%60 = scf.for %arg16 = %c0 to %37 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%61 = scf.for %arg18 = %c0 to %41 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%62 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%40, %arg18)
%63 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg16, %arg0, %36, %40, %arg18)
%64 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg16, %arg0, %36, %40, %arg18)
%extracted_slice_11 = tensor.extract_slice %extracted_slice_7[%arg14, %63, %64, %62] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_12 = tensor.extract_slice %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%65 = linalg.copy ins(%extracted_slice_11 : tensor<1x1x1x1xf16>) outs(%extracted_slice_12 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_13 = tensor.insert_slice %65 into %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_13 : tensor<?x1x?x?xf16>
}
scf.yield %61 : tensor<?x1x?x?xf16>
}
scf.yield %60 : tensor<?x1x?x?xf16>
}
%58 = tensor.empty() : tensor<1x1x1x4xf16>
%59 = linalg.fill ins(%cst_0 : f16) outs(%58 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%dim = tensor.dim %57, %c0 : tensor<?x1x?x?xf16>
%dim_8 = tensor.dim %57, %c2 : tensor<?x1x?x?xf16>
%dim_9 = tensor.dim %57, %c3 : tensor<?x1x?x?xf16>
%inserted_slice_10 = tensor.insert_slice %57 into %59[0, 0, 0, 0] [%dim, 1, %dim_8, %dim_9] [1, 1, 1, 1] : tensor<?x1x?x?xf16> into tensor<1x1x1x4xf16>
scf.yield %inserted_slice_10 : tensor<1x1x1x4xf16>
}
%extracted_slice_4 = tensor.extract_slice %arg12[%31#0, 0, %31#1, %32] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%45 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%44 : tensor<1x1x1x4xf16>) outs(%extracted_slice_4 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %45 into %arg12[%31#0, 0, %31#1, %32] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%46:2 = affine.delinearize_index %30 into (16, 16) : index, index
%extracted_slice_5 = tensor.extract_slice %arg13[%46#0, %46#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%47 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%46#0]
%48 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%47)
%49 = arith.cmpi eq, %48, %c0 : index
%50 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%46#1, %arg1)
%51 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%9, %50)
%52 = arith.cmpi eq, %51, %c0 : index
%53 = arith.ori %52, %49 : i1
%54 = scf.if %53 -> (tensor<1x1xf16>) {
%56 = tensor.empty() : tensor<1x1xf16>
%57 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%extracted_slice_5 : tensor<1x1xf16>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
} -> tensor<1x1xf16>
scf.yield %57 : tensor<1x1xf16>
} else {
%56 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%50]
%extracted_slice_7 = tensor.extract_slice %6[%47, %56] [%48, %51] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%57 = tensor.empty() : tensor<1x1xf16>
%58 = linalg.fill ins(%cst_0 : f16) outs(%extracted_slice_5 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_8 = tensor.insert_slice %extracted_slice_7 into %58[0, 0] [%48, %51] [1, 1] : tensor<?x?xf16> into tensor<1x1xf16>
scf.yield %inserted_slice_8 : tensor<1x1xf16>
}
%55 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%54 : tensor<1x1xf16>) outs(%extracted_slice_5 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_6 = tensor.insert_slice %55 into %arg13[%46#0, %46#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_6 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %24#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%25 = vector.transfer_read %expanded[%arg3, %c0, %arg5, %19, %c0, %20], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<2x1x2x16x1x16xf16>, vector<1x1x1x1x1x4xf16>
%26 = vector.transpose %25, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expanded_3 = tensor.expand_shape %24#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%27 = vector.transfer_read %expanded_3[%c0, %20, %c0, %19], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x16x1x16xf16>, vector<1x4x1x1xf16>
%28 = vector.transpose %27, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%29 = iree_gpu.multi_mma %26, %28, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %29 : vector<1x1x1x1x4x1xf32>
}
%extracted_slice_2 = tensor.extract_slice %arg9[0, 0, 0, %20, 0, %19] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%22 = vector.transpose %21, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%23 = vector.transfer_write %22, %extracted_slice_2[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, tensor<1x1x1x4x1x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %23 into %arg9[0, 0, 0, %20, 0, %19] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %13 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%extracted_slice = tensor.extract_slice %arg2[0, %arg0, 0, %8] [2, 1, 17, %9] [1, 1, 1, 1] : tensor<2x17x17x1281xf32> to tensor<2x1x17x?xf32>
%15 = tensor.empty(%9) : tensor<2x1x17x?xf32>
%16 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%9)
%17 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %16) shared_outs(%arg7 = %extracted_slice) -> (tensor<2x1x17x?xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%19 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%9]
%extracted_slice_1 = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %18] [1, 1, 1, %19] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_2 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %18] [1, 1, 1, %19] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%20 = scf.for %arg8 = %c0 to %19 step %c1 iter_args(%arg9 = %extracted_slice_2) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_3 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_4 = tensor.extract_slice %extracted_slice_1[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_4 : tensor<1x1x1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %21 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %20 into %arg7[%arg3, 0, %arg5, %18] [1, 1, 1, %19] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg2[0, %arg0, 0, %8] [2, 1, 17, %9] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
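// (note: remaining tensor.empty ops are converted to bufferization.alloc_tensor, e.g. %11 and the 1x1x1x4 scratch tensor below, so that one-shot bufferization can assign them buffers.)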
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>> -> tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = bufferization.alloc_tensor() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%16 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x16x1x16xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%18 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%19 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
%22:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%28 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%29:3 = affine.delinearize_index %28 into (2, 32, 4) : index, index, index
%30 = affine.apply affine_map<(d0) -> (d0 * 4)>(%29#2)
%31 = affine.min affine_map<(d0) -> (2, d0)>(%29#0)
%32 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = affine.min affine_map<(d0) -> (17, d0)>(%29#1)
%35 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %36, %33 : i1
%38 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %29#2)
%39 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%38)
%40 = arith.cmpi eq, %39, %c0 : index
%41 = arith.ori %40, %37 : i1
%42 = scf.if %41 -> (tensor<1x1x1x4xf16>) {
%54 = bufferization.alloc_tensor() : tensor<1x1x1x4xf16>
%55 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%54 : tensor<1x1x1x4xf16>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
} -> tensor<1x1x1x4xf16>
scf.yield %55 : tensor<1x1x1x4xf16>
} else {
%extracted_slice_7 = tensor.extract_slice %3[%31, 0, 0, 0] [%32, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%54 = bufferization.alloc_tensor(%32, %35, %39) : tensor<?x1x?x?xf16>
%55 = scf.for %arg14 = %c0 to %32 step %c1 iter_args(%arg15 = %54) -> (tensor<?x1x?x?xf16>) {
%58 = scf.for %arg16 = %c0 to %35 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%59 = scf.for %arg18 = %c0 to %39 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%60 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%38, %arg18)
%61 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg16, %arg0, %34, %38, %arg18)
%62 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg16, %arg0, %34, %38, %arg18)
%extracted_slice_11 = tensor.extract_slice %extracted_slice_7[%arg14, %61, %62, %60] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_12 = tensor.extract_slice %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%63 = linalg.copy ins(%extracted_slice_11 : tensor<1x1x1x1xf16>) outs(%extracted_slice_12 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_13 = tensor.insert_slice %63 into %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_13 : tensor<?x1x?x?xf16>
}
scf.yield %59 : tensor<?x1x?x?xf16>
}
scf.yield %58 : tensor<?x1x?x?xf16>
}
%56 = bufferization.alloc_tensor() : tensor<1x1x1x4xf16>
%57 = linalg.fill ins(%cst_0 : f16) outs(%56 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%dim = tensor.dim %55, %c0 : tensor<?x1x?x?xf16>
%dim_8 = tensor.dim %55, %c2 : tensor<?x1x?x?xf16>
%dim_9 = tensor.dim %55, %c3 : tensor<?x1x?x?xf16>
%inserted_slice_10 = tensor.insert_slice %55 into %57[0, 0, 0, 0] [%dim, 1, %dim_8, %dim_9] [1, 1, 1, 1] : tensor<?x1x?x?xf16> into tensor<1x1x1x4xf16>
scf.yield %inserted_slice_10 : tensor<1x1x1x4xf16>
}
%extracted_slice_4 = tensor.extract_slice %arg12[%29#0, 0, %29#1, %30] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%43 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%42 : tensor<1x1x1x4xf16>) outs(%extracted_slice_4 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %43 into %arg12[%29#0, 0, %29#1, %30] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%44:2 = affine.delinearize_index %28 into (16, 16) : index, index
%extracted_slice_5 = tensor.extract_slice %arg13[%44#0, %44#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%45 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%44#0]
%46 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%45)
%47 = arith.cmpi eq, %46, %c0 : index
%48 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%44#1, %arg1)
%49 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %48)
%50 = arith.cmpi eq, %49, %c0 : index
%51 = arith.ori %50, %47 : i1
%52 = scf.if %51 -> (tensor<1x1xf16>) {
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%extracted_slice_5 : tensor<1x1xf16>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
} -> tensor<1x1xf16>
scf.yield %54 : tensor<1x1xf16>
} else {
%54 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%48]
%extracted_slice_7 = tensor.extract_slice %5[%45, %54] [%46, %49] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%55 = linalg.fill ins(%cst_0 : f16) outs(%extracted_slice_5 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_8 = tensor.insert_slice %extracted_slice_7 into %55[0, 0] [%46, %49] [1, 1] : tensor<?x?xf16> into tensor<1x1xf16>
scf.yield %inserted_slice_8 : tensor<1x1xf16>
}
%53 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%52 : tensor<1x1xf16>) outs(%extracted_slice_5 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_6 = tensor.insert_slice %53 into %arg13[%44#0, %44#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_6 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %22#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%23 = vector.transfer_read %expanded[%arg3, %c0, %arg5, %17, %c0, %18], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<2x1x2x16x1x16xf16>, vector<1x1x1x1x1x4xf16>
%24 = vector.transpose %23, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expanded_3 = tensor.expand_shape %22#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%25 = vector.transfer_read %expanded_3[%c0, %18, %c0, %17], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x16x1x16xf16>, vector<1x4x1x1xf16>
%26 = vector.transpose %25, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%27 = iree_gpu.multi_mma %24, %26, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %27 : vector<1x1x1x1x4x1xf32>
}
%extracted_slice_2 = tensor.extract_slice %arg9[0, 0, 0, %18, 0, %17] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%20 = vector.transpose %19, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%21 = vector.transfer_write %20, %extracted_slice_2[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, tensor<1x1x1x4x1x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %21 into %arg9[0, 0, 0, %18, 0, %17] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%extracted_slice = tensor.extract_slice %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x17x17x1281xf32> to tensor<2x1x17x?xf32>
%14 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%15 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %14) shared_outs(%arg7 = %extracted_slice) -> (tensor<2x1x17x?xf32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%17 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice_1 = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %16] [1, 1, 1, %17] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_2 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %16] [1, 1, 1, %17] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%18 = scf.for %arg8 = %c0 to %17 step %c1 iter_args(%arg9 = %extracted_slice_2) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_3 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_4 = tensor.extract_slice %extracted_slice_1[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%19 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_4 : tensor<1x1x1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %19 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg7[%arg3, 0, %arg5, %16] [1, 1, 1, %17] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %15 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After GPUInferMemorySpacePass (iree-codegen-gpu-infer-memory-space) //----- //
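// In this dispatch, GPUInferMemorySpacePass adds `memory_space` annotations to
// the alloc_tensor ops introduced above: the accumulator shared by the
// warp-level scf.forall (%11) is placed in #gpu.address_space<workgroup>,
// while the per-thread staging tensors inside the scf.if (%54, %56) are placed
// in #gpu.address_space<private>. A minimal sketch (illustrative shape only):
//   %t = bufferization.alloc_tensor() : tensor<16x16xf16>
//   // becomes
//   %t = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>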
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>> -> tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%16 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x16x1x16xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%18 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%19 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
%22:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%28 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%29:3 = affine.delinearize_index %28 into (2, 32, 4) : index, index, index
%30 = affine.apply affine_map<(d0) -> (d0 * 4)>(%29#2)
%31 = affine.min affine_map<(d0) -> (2, d0)>(%29#0)
%32 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = affine.min affine_map<(d0) -> (17, d0)>(%29#1)
%35 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %36, %33 : i1
%38 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %29#2)
%39 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%38)
%40 = arith.cmpi eq, %39, %c0 : index
%41 = arith.ori %40, %37 : i1
%42 = scf.if %41 -> (tensor<1x1x1x4xf16>) {
%54 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<private>} : tensor<1x1x1x4xf16>
%55 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%54 : tensor<1x1x1x4xf16>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
} -> tensor<1x1x1x4xf16>
scf.yield %55 : tensor<1x1x1x4xf16>
} else {
%extracted_slice_7 = tensor.extract_slice %3[%31, 0, 0, 0] [%32, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%54 = bufferization.alloc_tensor(%32, %35, %39) {memory_space = #gpu.address_space<private>} : tensor<?x1x?x?xf16>
%55 = scf.for %arg14 = %c0 to %32 step %c1 iter_args(%arg15 = %54) -> (tensor<?x1x?x?xf16>) {
%58 = scf.for %arg16 = %c0 to %35 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%59 = scf.for %arg18 = %c0 to %39 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%60 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%38, %arg18)
%61 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg16, %arg0, %34, %38, %arg18)
%62 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg16, %arg0, %34, %38, %arg18)
%extracted_slice_11 = tensor.extract_slice %extracted_slice_7[%arg14, %61, %62, %60] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_12 = tensor.extract_slice %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%63 = linalg.copy ins(%extracted_slice_11 : tensor<1x1x1x1xf16>) outs(%extracted_slice_12 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_13 = tensor.insert_slice %63 into %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_13 : tensor<?x1x?x?xf16>
}
scf.yield %59 : tensor<?x1x?x?xf16>
}
scf.yield %58 : tensor<?x1x?x?xf16>
}
%56 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<private>} : tensor<1x1x1x4xf16>
%57 = linalg.fill ins(%cst_0 : f16) outs(%56 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%dim = tensor.dim %55, %c0 : tensor<?x1x?x?xf16>
%dim_8 = tensor.dim %55, %c2 : tensor<?x1x?x?xf16>
%dim_9 = tensor.dim %55, %c3 : tensor<?x1x?x?xf16>
%inserted_slice_10 = tensor.insert_slice %55 into %57[0, 0, 0, 0] [%dim, 1, %dim_8, %dim_9] [1, 1, 1, 1] : tensor<?x1x?x?xf16> into tensor<1x1x1x4xf16>
scf.yield %inserted_slice_10 : tensor<1x1x1x4xf16>
}
%extracted_slice_4 = tensor.extract_slice %arg12[%29#0, 0, %29#1, %30] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%43 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%42 : tensor<1x1x1x4xf16>) outs(%extracted_slice_4 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %43 into %arg12[%29#0, 0, %29#1, %30] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%44:2 = affine.delinearize_index %28 into (16, 16) : index, index
%extracted_slice_5 = tensor.extract_slice %arg13[%44#0, %44#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%45 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%44#0]
%46 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%45)
%47 = arith.cmpi eq, %46, %c0 : index
%48 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%44#1, %arg1)
%49 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %48)
%50 = arith.cmpi eq, %49, %c0 : index
%51 = arith.ori %50, %47 : i1
%52 = scf.if %51 -> (tensor<1x1xf16>) {
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%extracted_slice_5 : tensor<1x1xf16>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
} -> tensor<1x1xf16>
scf.yield %54 : tensor<1x1xf16>
} else {
%54 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%48]
%extracted_slice_7 = tensor.extract_slice %5[%45, %54] [%46, %49] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%55 = linalg.fill ins(%cst_0 : f16) outs(%extracted_slice_5 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_8 = tensor.insert_slice %extracted_slice_7 into %55[0, 0] [%46, %49] [1, 1] : tensor<?x?xf16> into tensor<1x1xf16>
scf.yield %inserted_slice_8 : tensor<1x1xf16>
}
%53 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%52 : tensor<1x1xf16>) outs(%extracted_slice_5 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_6 = tensor.insert_slice %53 into %arg13[%44#0, %44#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_6 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %22#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%23 = vector.transfer_read %expanded[%arg3, %c0, %arg5, %17, %c0, %18], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<2x1x2x16x1x16xf16>, vector<1x1x1x1x1x4xf16>
%24 = vector.transpose %23, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expanded_3 = tensor.expand_shape %22#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%25 = vector.transfer_read %expanded_3[%c0, %18, %c0, %17], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x16x1x16xf16>, vector<1x4x1x1xf16>
%26 = vector.transpose %25, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%27 = iree_gpu.multi_mma %24, %26, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %27 : vector<1x1x1x1x4x1xf32>
}
%extracted_slice_2 = tensor.extract_slice %arg9[0, 0, 0, %18, 0, %17] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%20 = vector.transpose %19, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%21 = vector.transfer_write %20, %extracted_slice_2[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, tensor<1x1x1x4x1x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %21 into %arg9[0, 0, 0, %18, 0, %17] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%extracted_slice = tensor.extract_slice %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x17x17x1281xf32> to tensor<2x1x17x?xf32>
%14 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%15 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %14) shared_outs(%arg7 = %extracted_slice) -> (tensor<2x1x17x?xf32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%17 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice_1 = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %16] [1, 1, 1, %17] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_2 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %16] [1, 1, 1, %17] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%18 = scf.for %arg8 = %c0 to %17 step %c1 iter_args(%arg9 = %extracted_slice_2) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_3 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_4 = tensor.extract_slice %extracted_slice_1[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%19 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_4 : tensor<1x1x1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %19 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg7[%arg3, 0, %arg5, %16] [1, 1, 1, %17] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %15 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- //
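// Comprehensive bufferization replaces the tensor SSA values with memrefs:
// the hal.interface.binding.subspan ops now produce memrefs (with
// memref.assume_alignment), workgroup alloc_tensors become memref.alloc in
// #gpu.address_space<workgroup>, private ones become memref.alloca, and the
// extract_slice/insert_slice pairs turn into memref.subview plus
// linalg.copy/memref.copy. A minimal sketch of the alloc rewrite
// (illustrative shape only):
//   %t = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
//   // becomes
//   %m = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
// The iree_gpu.barrier_region now appears as plain gpu.barrier ops around the
// shared-memory fills, and the trailing `memref.copy %2, %2` that replaces
// flow.dispatch.tensor.store copies the output buffer onto itself, i.e. it is
// a no-op.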
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 2, 1) {
%subview_4 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.forall (%arg6) in (64) {
%6 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg6)
%7 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg6)
%8 = scf.for %arg7 = %c0 to %c721 step %c1 iter_args(%arg8 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg6, %arg4, %arg2)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%20 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg7, %11#2)
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = scf.if %23 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_13 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %21) : memref<?x1x?x?xf16, #gpu.address_space<private>>
%39 = scf.for %arg9 = %c0 to %14 step %c1 iter_args(%arg10 = %alloca) -> (memref<?x1x?x?xf16, #gpu.address_space<private>>) {
%40 = scf.for %arg11 = %c0 to %17 step %c1 iter_args(%arg12 = %arg10) -> (memref<?x1x?x?xf16, #gpu.address_space<private>>) {
%41 = scf.for %arg13 = %c0 to %21 step %c1 iter_args(%arg14 = %arg12) -> (memref<?x1x?x?xf16, #gpu.address_space<private>>) {
%42 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%20, %arg13)
%43 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg11, %arg0, %16, %20, %arg13)
%44 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg11, %arg0, %16, %20, %arg13)
%subview_18 = memref.subview %subview_13[%arg9, %43, %44, %42] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_19 = memref.subview %arg14[%arg9, 0, %arg11, %arg13] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
linalg.copy ins(%subview_18 : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_19 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>)
%subview_20 = memref.subview %arg14[%arg9, 0, %arg11, %arg13] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
memref.copy %subview_19, %subview_20 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
scf.yield %arg14 : memref<?x1x?x?xf16, #gpu.address_space<private>>
}
scf.yield %41 : memref<?x1x?x?xf16, #gpu.address_space<private>>
}
scf.yield %40 : memref<?x1x?x?xf16, #gpu.address_space<private>>
}
%alloca_14 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_14 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%dim = memref.dim %39, %c0 : memref<?x1x?x?xf16, #gpu.address_space<private>>
%dim_15 = memref.dim %39, %c2 : memref<?x1x?x?xf16, #gpu.address_space<private>>
%dim_16 = memref.dim %39, %c3 : memref<?x1x?x?xf16, #gpu.address_space<private>>
%subview_17 = memref.subview %alloca_14[0, 0, 0, 0] [%dim, 1, %dim_15, %dim_16] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %39, %subview_17 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_14 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_8 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%24 : memref<1x1x1x4xf16, #gpu.address_space<private>>) outs(%subview_8 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_9 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%25:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_10 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%26 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg7)[%25#0]
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%25#1, %arg1)
%30 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %29)
%31 = arith.cmpi eq, %30, %c0 : index
%32 = arith.ori %31, %28 : i1
%33 = scf.if %32 -> (memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
} else {
%39 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%29]
%subview_13 = memref.subview %1[%26, %39] [%27, %30] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_14 = memref.subview %subview_10[0, 0] [%27, %30] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_13, %subview_14 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.yield %subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%33 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_11 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_10, %subview_11 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%34 = vector.transfer_read %expand_shape[%arg2, %c0, %arg4, %6, %c0, %7], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%35 = vector.transpose %34, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_12 = memref.expand_shape %alloc_1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%36 = vector.transfer_read %expand_shape_12[%c0, %7, %c0, %6], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%37 = vector.transpose %36, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%38 = iree_gpu.multi_mma %35, %37, %arg8 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %38 : vector<1x1x1x1x4x1xf32>
}
%subview_6 = memref.subview %subview_4[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%9 = vector.transpose %8, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %9, %subview_6[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview_4[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_7 : memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#iree_gpu.lane_id<0>]}
%subview_5 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%collapse_shape = memref.collapse_shape %alloc_2 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 17, %5) {
%6 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
%7 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg5)[%4]
%subview_4 = memref.subview %collapse_shape[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = scf.for %arg6 = %c0 to %7 step %c1 iter_args(%arg7 = %subview_5) -> (memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
%subview_7 = memref.subview %arg7[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_4[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%subview_8 : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_7 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
%subview_9 = memref.subview %arg7[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_7, %subview_9 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.yield %arg7 : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
%subview_6 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %8, %subview_6 : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%subview_3 = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview, %subview_3 : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.copy %2, %2 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
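// ResolveShapedTypeResultDims rewrites dim queries on op results in terms of
// the operands that define those shapes. In the portion of this dispatch shown
// here the IR is unchanged from the previous dump; the memref.dim ops on the
// scf.for result %39 (%dim, %dim_15, %dim_16) are still present at this point.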
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 2, 1) {
%subview_4 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.forall (%arg6) in (64) {
%6 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg6)
%7 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg6)
%8 = scf.for %arg7 = %c0 to %c721 step %c1 iter_args(%arg8 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg6, %arg4, %arg2)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%20 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg7, %11#2)
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = scf.if %23 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_13 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %21) : memref<?x1x?x?xf16, #gpu.address_space<private>>
%39 = scf.for %arg9 = %c0 to %14 step %c1 iter_args(%arg10 = %alloca) -> (memref<?x1x?x?xf16, #gpu.address_space<private>>) {
%40 = scf.for %arg11 = %c0 to %17 step %c1 iter_args(%arg12 = %arg10) -> (memref<?x1x?x?xf16, #gpu.address_space<private>>) {
%41 = scf.for %arg13 = %c0 to %21 step %c1 iter_args(%arg14 = %arg12) -> (memref<?x1x?x?xf16, #gpu.address_space<private>>) {
%42 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%20, %arg13)
%43 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg11, %arg0, %16, %20, %arg13)
%44 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg11, %arg0, %16, %20, %arg13)
%subview_18 = memref.subview %subview_13[%arg9, %43, %44, %42] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_19 = memref.subview %arg14[%arg9, 0, %arg11, %arg13] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
linalg.copy ins(%subview_18 : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_19 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>)
%subview_20 = memref.subview %arg14[%arg9, 0, %arg11, %arg13] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
memref.copy %subview_19, %subview_20 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
scf.yield %arg14 : memref<?x1x?x?xf16, #gpu.address_space<private>>
}
scf.yield %41 : memref<?x1x?x?xf16, #gpu.address_space<private>>
}
scf.yield %40 : memref<?x1x?x?xf16, #gpu.address_space<private>>
}
%alloca_14 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_14 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%dim = memref.dim %39, %c0 : memref<?x1x?x?xf16, #gpu.address_space<private>>
%dim_15 = memref.dim %39, %c2 : memref<?x1x?x?xf16, #gpu.address_space<private>>
%dim_16 = memref.dim %39, %c3 : memref<?x1x?x?xf16, #gpu.address_space<private>>
%subview_17 = memref.subview %alloca_14[0, 0, 0, 0] [%dim, 1, %dim_15, %dim_16] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %39, %subview_17 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_14 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_8 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%24 : memref<1x1x1x4xf16, #gpu.address_space<private>>) outs(%subview_8 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_9 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%25:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_10 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%26 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg7)[%25#0]
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%25#1, %arg1)
%30 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %29)
%31 = arith.cmpi eq, %30, %c0 : index
%32 = arith.ori %31, %28 : i1
%33 = scf.if %32 -> (memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
} else {
%39 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%29]
%subview_13 = memref.subview %1[%26, %39] [%27, %30] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_14 = memref.subview %subview_10[0, 0] [%27, %30] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_13, %subview_14 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.yield %subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%33 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_11 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_10, %subview_11 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%34 = vector.transfer_read %expand_shape[%arg2, %c0, %arg4, %6, %c0, %7], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%35 = vector.transpose %34, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_12 = memref.expand_shape %alloc_1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%36 = vector.transfer_read %expand_shape_12[%c0, %7, %c0, %6], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%37 = vector.transpose %36, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%38 = iree_gpu.multi_mma %35, %37, %arg8 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %38 : vector<1x1x1x1x4x1xf32>
}
%subview_6 = memref.subview %subview_4[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%9 = vector.transpose %8, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %9, %subview_6[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview_4[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_7 : memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#iree_gpu.lane_id<0>]}
%subview_5 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%collapse_shape = memref.collapse_shape %alloc_2 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 17, %5) {
%6 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
%7 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg5)[%4]
%subview_4 = memref.subview %collapse_shape[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = scf.for %arg6 = %c0 to %7 step %c1 iter_args(%arg7 = %subview_5) -> (memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
%subview_7 = memref.subview %arg7[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_4[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%subview_8 : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_7 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
%subview_9 = memref.subview %arg7[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_7, %subview_9 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.yield %arg7 : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
%subview_6 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %8, %subview_6 : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%subview_3 = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview, %subview_3 : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.copy %2, %2 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
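// NOTE: illustrative annotation, not compiler output. Relative to the preceding dump, this
// canonicalizer run erased the copies whose source and destination had become the same view
// (e.g. the trailing memref.copy %2, %2) and dropped the scf.if / scf.for results that only
// forwarded an already-available memref (the loop-carried %subview_5 and the if-yielded
// %subview_10 above). A minimal standalone sketch of both folds, assuming generic upstream
// --canonicalize behavior; the function below is hypothetical and not part of this dispatch:
func.func @canonicalize_sketch(%m: memref<4xf16>, %ub: index) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  // Self-copy: source and target are the same value, so the copy folds away.
  memref.copy %m, %m : memref<4xf16> to memref<4xf16>
  // The iter_arg is only passed through, so the result folds to %m and the
  // loop-carried value is dropped.
  %r = scf.for %i = %c0 to %ub step %c1 iter_args(%it = %m) -> (memref<4xf16>) {
    scf.yield %it : memref<4xf16>
  }
  return
}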
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 2, 1) {
%subview_4 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.forall (%arg6) in (64) {
%6 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg6)
%7 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg6)
%8 = scf.for %arg7 = %c0 to %c721 step %c1 iter_args(%arg8 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg6, %arg4, %arg2)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%20 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg7, %11#2)
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = scf.if %23 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_13 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %21) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg9 = %c0 to %14 step %c1 {
scf.for %arg10 = %c0 to %17 step %c1 {
scf.for %arg11 = %c0 to %21 step %c1 {
%38 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%20, %arg11)
%39 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg10, %arg0, %16, %20, %arg11)
%40 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg10, %arg0, %16, %20, %arg11)
%subview_16 = memref.subview %subview_13[%arg9, %39, %40, %38] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_17 = memref.subview %alloca[%arg9, 0, %arg10, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
linalg.copy ins(%subview_16 : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_17 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>)
%subview_18 = memref.subview %alloca[%arg9, 0, %arg10, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
memref.copy %subview_17, %subview_18 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
}
}
}
%alloca_14 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_14 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_15 = memref.subview %alloca_14[0, 0, 0, 0] [%14, 1, %17, %21] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_15 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_14 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_8 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%24 : memref<1x1x1x4xf16, #gpu.address_space<private>>) outs(%subview_8 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_9 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%25:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_10 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%26 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg7)[%25#0]
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%25#1, %arg1)
%30 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %29)
%31 = arith.cmpi eq, %30, %c0 : index
%32 = arith.ori %31, %28 : i1
scf.if %32 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%38 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%29]
%subview_13 = memref.subview %1[%26, %38] [%27, %30] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_14 = memref.subview %subview_10[0, 0] [%27, %30] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_13, %subview_14 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
%subview_11 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_10, %subview_11 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%33 = vector.transfer_read %expand_shape[%arg2, %c0, %arg4, %6, %c0, %7], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%34 = vector.transpose %33, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_12 = memref.expand_shape %alloc_1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%35 = vector.transfer_read %expand_shape_12[%c0, %7, %c0, %6], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%36 = vector.transpose %35, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%37 = iree_gpu.multi_mma %34, %36, %arg8 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %37 : vector<1x1x1x1x4x1xf32>
}
%subview_6 = memref.subview %subview_4[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%9 = vector.transpose %8, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %9, %subview_6[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview_4[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_7 : memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#iree_gpu.lane_id<0>]}
%subview_5 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%collapse_shape = memref.collapse_shape %alloc_2 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 17, %5) {
%6 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
%7 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg5)[%4]
%subview_4 = memref.subview %collapse_shape[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg6 = %c0 to %7 step %c1 {
%subview_7 = memref.subview %subview_5[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_4[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%subview_8 : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_7 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
%subview_9 = memref.subview %subview_5[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_7, %subview_9 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
%subview_6 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_5, %subview_6 : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%subview_3 = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview, %subview_3 : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
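// NOTE: illustrative annotation, not compiler output. CSE deduplicates identical side-effect-free
// ops, so the pairs of memref.subview ops above that recomputed the same slice (e.g. %subview_8 /
// %subview_9) collapse into a single value in the dump below, turning the copies between them into
// self-copies that the next canonicalizer run erases. A minimal standalone sketch; the function is
// hypothetical and not part of this dispatch:
func.func @cse_sketch(%buf: memref<16x16xf16>, %i: index) {
  // Two textually identical subviews: CSE keeps %a and rewires uses of %b to it,
  // which makes the copy below a self-copy.
  %a = memref.subview %buf[%i, 0] [1, 4] [1, 1] : memref<16x16xf16> to memref<1x4xf16, strided<[16, 1], offset: ?>>
  %b = memref.subview %buf[%i, 0] [1, 4] [1, 1] : memref<16x16xf16> to memref<1x4xf16, strided<[16, 1], offset: ?>>
  memref.copy %a, %b : memref<1x4xf16, strided<[16, 1], offset: ?>> to memref<1x4xf16, strided<[16, 1], offset: ?>>
  return
}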
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 2, 1) {
%subview_3 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.forall (%arg6) in (64) {
%6 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg6)
%7 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg6)
%8 = scf.for %arg7 = %c0 to %c721 step %c1 iter_args(%arg8 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg6, %arg4, %arg2)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%20 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg7, %11#2)
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = scf.if %23 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_8 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %21) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg9 = %c0 to %14 step %c1 {
scf.for %arg10 = %c0 to %17 step %c1 {
scf.for %arg11 = %c0 to %21 step %c1 {
%38 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%20, %arg11)
%39 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg10, %arg0, %16, %20, %arg11)
%40 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg10, %arg0, %16, %20, %arg11)
%subview_11 = memref.subview %subview_8[%arg9, %39, %40, %38] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_12 = memref.subview %alloca[%arg9, 0, %arg10, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
linalg.copy ins(%subview_11 : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_12 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>)
memref.copy %subview_12, %subview_12 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
}
}
}
%alloca_9 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_10 = memref.subview %alloca_9[0, 0, 0, 0] [%14, 1, %17, %21] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_10 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_5 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%24 : memref<1x1x1x4xf16, #gpu.address_space<private>>) outs(%subview_5 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>)
memref.copy %subview_5, %subview_5 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%25:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_6 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%26 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg7)[%25#0]
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%25#1, %arg1)
%30 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %29)
%31 = arith.cmpi eq, %30, %c0 : index
%32 = arith.ori %31, %28 : i1
scf.if %32 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%38 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%29]
%subview_8 = memref.subview %1[%26, %38] [%27, %30] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_9 = memref.subview %subview_6[0, 0] [%27, %30] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
memref.copy %subview_6, %subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%33 = vector.transfer_read %expand_shape[%arg2, %c0, %arg4, %6, %c0, %7], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%34 = vector.transpose %33, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_7 = memref.expand_shape %alloc_1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%35 = vector.transfer_read %expand_shape_7[%c0, %7, %c0, %6], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%36 = vector.transpose %35, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%37 = iree_gpu.multi_mma %34, %36, %arg8 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %37 : vector<1x1x1x1x4x1xf32>
}
%subview_4 = memref.subview %subview_3[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%9 = vector.transpose %8, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %9, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_4 : memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#iree_gpu.lane_id<0>]}
memref.copy %subview_3, %subview_3 : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%collapse_shape = memref.collapse_shape %alloc_2 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 17, %5) {
%6 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
%7 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg5)[%4]
%subview_3 = memref.subview %collapse_shape[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg6 = %c0 to %7 step %c1 {
%subview_5 = memref.subview %subview_4[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %subview_3[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%subview_6 : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_5 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.copy %subview_5, %subview_5 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
memref.copy %subview_4, %subview_4 : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
memref.copy %subview, %subview : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
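// NOTE: illustrative annotation, not compiler output. This second canonicalizer run cleans up what
// CSE exposed: the copies whose operands are now the same value (memref.copy %subview_5, %subview_5
// and friends above) fold away, as sketched after the first canonicalizer dump.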
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 2, 1) {
%subview_3 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.forall (%arg6) in (64) {
%6 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg6)
%7 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg6)
%8 = scf.for %arg7 = %c0 to %c721 step %c1 iter_args(%arg8 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg6, %arg4, %arg2)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%20 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg7, %11#2)
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = scf.if %23 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_8 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %21) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg9 = %c0 to %14 step %c1 {
scf.for %arg10 = %c0 to %17 step %c1 {
scf.for %arg11 = %c0 to %21 step %c1 {
%38 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%20, %arg11)
%39 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg10, %arg0, %16, %20, %arg11)
%40 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg10, %arg0, %16, %20, %arg11)
%subview_11 = memref.subview %subview_8[%arg9, %39, %40, %38] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_12 = memref.subview %alloca[%arg9, 0, %arg10, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
linalg.copy ins(%subview_11 : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_12 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>)
}
}
}
%alloca_9 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_10 = memref.subview %alloca_9[0, 0, 0, 0] [%14, 1, %17, %21] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_10 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_5 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%24 : memref<1x1x1x4xf16, #gpu.address_space<private>>) outs(%subview_5 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%25:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_6 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%26 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg7)[%25#0]
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%25#1, %arg1)
%30 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %29)
%31 = arith.cmpi eq, %30, %c0 : index
%32 = arith.ori %31, %28 : i1
scf.if %32 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%38 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%29]
%subview_8 = memref.subview %1[%26, %38] [%27, %30] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_9 = memref.subview %subview_6[0, 0] [%27, %30] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%33 = vector.transfer_read %expand_shape[%arg2, %c0, %arg4, %6, %c0, %7], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%34 = vector.transpose %33, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_7 = memref.expand_shape %alloc_1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%35 = vector.transfer_read %expand_shape_7[%c0, %7, %c0, %6], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%36 = vector.transpose %35, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%37 = iree_gpu.multi_mma %34, %36, %arg8 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %37 : vector<1x1x1x1x4x1xf32>
}
%subview_4 = memref.subview %subview_3[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%9 = vector.transpose %8, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %9, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#iree_gpu.lane_id<0>]}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%collapse_shape = memref.collapse_shape %alloc_2 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 17, %5) {
%6 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
%7 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg5)[%4]
%subview_3 = memref.subview %collapse_shape[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg6 = %c0 to %7 step %c1 {
%subview_5 = memref.subview %subview_4[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %subview_3[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%subview_6 : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_5 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
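// NOTE: illustrative annotation, not compiler output. In the portion reproduced below the function
// is unchanged from the preceding canonicalized dump. Assuming this cleanup pass targets leftover
// allocations and view-like ops that the earlier copy/CSE cleanups left dead (an assumption, not
// something visible in this dump), the kind of IR it would remove looks like this hypothetical sketch:
func.func @dead_alloc_sketch() {
  // An allocation with no remaining users; cleanup of this kind simply drops it.
  %unused = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
  return
}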
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 2, 1) {
%subview_3 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.forall (%arg6) in (64) {
%6 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg6)
%7 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg6)
%8 = scf.for %arg7 = %c0 to %c721 step %c1 iter_args(%arg8 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg6, %arg4, %arg2)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%20 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg7, %11#2)
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = scf.if %23 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_8 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %21) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg9 = %c0 to %14 step %c1 {
scf.for %arg10 = %c0 to %17 step %c1 {
scf.for %arg11 = %c0 to %21 step %c1 {
%38 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%20, %arg11)
%39 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg10, %arg0, %16, %20, %arg11)
%40 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg10, %arg0, %16, %20, %arg11)
%subview_11 = memref.subview %subview_8[%arg9, %39, %40, %38] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_12 = memref.subview %alloca[%arg9, 0, %arg10, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
linalg.copy ins(%subview_11 : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_12 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>)
}
}
}
%alloca_9 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_10 = memref.subview %alloca_9[0, 0, 0, 0] [%14, 1, %17, %21] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_10 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_5 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%24 : memref<1x1x1x4xf16, #gpu.address_space<private>>) outs(%subview_5 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%25:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_6 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%26 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg7)[%25#0]
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%25#1, %arg1)
%30 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %29)
%31 = arith.cmpi eq, %30, %c0 : index
%32 = arith.ori %31, %28 : i1
scf.if %32 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%38 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%29]
%subview_8 = memref.subview %1[%26, %38] [%27, %30] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_9 = memref.subview %subview_6[0, 0] [%27, %30] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%33 = vector.transfer_read %expand_shape[%arg2, %c0, %arg4, %6, %c0, %7], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%34 = vector.transpose %33, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_7 = memref.expand_shape %alloc_1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%35 = vector.transfer_read %expand_shape_7[%c0, %7, %c0, %6], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%36 = vector.transpose %35, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%37 = iree_gpu.multi_mma %34, %36, %arg8 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %37 : vector<1x1x1x1x4x1xf32>
}
%subview_4 = memref.subview %subview_3[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%9 = vector.transpose %8, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %9, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#iree_gpu.lane_id<0>]}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%collapse_shape = memref.collapse_shape %alloc_2 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 17, %5) {
%6 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
%7 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg5)[%4]
%subview_3 = memref.subview %collapse_shape[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg6 = %c0 to %7 step %c1 {
%subview_5 = memref.subview %subview_4[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %subview_3[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%subview_6 : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_5 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
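// Note (not part of the pass output): a minimal, hypothetical sketch of how the linear thread
// id is turned into coordinates for filling the 2x1x32x16 LHS tile in workgroup memory.
// affine.delinearize_index splits the id with the last basis element fastest, so the 256
// threads of the workgroup each own one contiguous 4 x f16 slice of the 2 x 32 x 16 tile.
func.func @linear_tid_to_lhs_coords(%tid: index) -> (index, index, index) {
  // For %tid in [0, 256): batch = %tid floordiv 128, row = (%tid mod 128) floordiv 4,
  // vec = %tid mod 4; the per-thread copy then starts at column vec * 4 of the 16-wide row.
  %batch, %row, %vec = affine.delinearize_index %tid into (2, 32, 4) : index, index, index
  return %batch, %row, %vec : index, index, index
}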
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 2, 1) {
%subview_3 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.forall (%arg6) in (64) {
%6 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg6)
%7 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg6)
%8 = scf.for %arg7 = %c0 to %c721 step %c1 iter_args(%arg8 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg6, %arg4, %arg2)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%20 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg7, %11#2)
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = scf.if %23 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_8 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %21) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg9 = %c0 to %14 step %c1 {
scf.for %arg10 = %c0 to %17 step %c1 {
scf.for %arg11 = %c0 to %21 step %c1 {
%38 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%20, %arg11)
%39 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg10, %arg0, %16, %20, %arg11)
%40 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg10, %arg0, %16, %20, %arg11)
%subview_11 = memref.subview %subview_8[%arg9, %39, %40, %38] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_12 = memref.subview %alloca[%arg9, 0, %arg10, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
linalg.copy ins(%subview_11 : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_12 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>)
}
}
}
%alloca_9 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_10 = memref.subview %alloca_9[0, 0, 0, 0] [%14, 1, %17, %21] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_10 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_5 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%24 : memref<1x1x1x4xf16, #gpu.address_space<private>>) outs(%subview_5 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%25:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_6 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%26 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg7)[%25#0]
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%25#1, %arg1)
%30 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %29)
%31 = arith.cmpi eq, %30, %c0 : index
%32 = arith.ori %31, %28 : i1
scf.if %32 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%38 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%29]
%subview_8 = memref.subview %1[%26, %38] [%27, %30] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_9 = memref.subview %subview_6[0, 0] [%27, %30] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%33 = vector.transfer_read %expand_shape[%arg2, %c0, %arg4, %6, %c0, %7], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%34 = vector.transpose %33, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_7 = memref.expand_shape %alloc_1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%35 = vector.transfer_read %expand_shape_7[%c0, %7, %c0, %6], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%36 = vector.transpose %35, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%37 = iree_gpu.multi_mma %34, %36, %arg8 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %37 : vector<1x1x1x1x4x1xf32>
}
%subview_4 = memref.subview %subview_3[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%9 = vector.transpose %8, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %9, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#iree_gpu.lane_id<0>]}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%collapse_shape = memref.collapse_shape %alloc_2 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 17, %5) {
%6 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
%7 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg5)[%4]
%subview_3 = memref.subview %collapse_shape[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg6 = %c0 to %7 step %c1 {
%subview_5 = memref.subview %subview_4[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %subview_3[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%subview_6 : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_5 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
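// Note (not part of the pass output): a simplified, hypothetical helper showing the scalar
// index arithmetic behind the im2col gather in the innermost loops above, for a 3x3 kernel
// with strides [2, 2] over 1281 input channels. The reduction index decomposes as
// k = kh * 3843 + kw * 1281 + c (3843 = 3 * 1281), and the sampled input pixel for output
// position (oh, ow) is (2 * oh + kh, 2 * ow + kw); the maps in the dump fold the output-row
// delinearization into the same expressions, which is why they look more involved.
func.func @im2col_indices(%oh: index, %ow: index, %k: index) -> (index, index, index) {
  %c  = affine.apply affine_map<(d0) -> (d0 mod 1281)>(%k)
  %ih = affine.apply affine_map<(d0, d1) -> (d0 * 2 + d1 floordiv 3843)>(%oh, %k)
  %iw = affine.apply affine_map<(d0, d1) -> (d0 * 2 + (d1 mod 3843) floordiv 1281)>(%ow, %k)
  return %ih, %iw, %c : index, index, index
}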
// -----// IR Dump After CSE (cse) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 2, 1) {
%subview_3 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.forall (%arg6) in (64) {
%6 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg6)
%7 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg6)
%8 = scf.for %arg7 = %c0 to %c721 step %c1 iter_args(%arg8 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg6, %arg4, %arg2)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%20 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg7, %11#2)
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = scf.if %23 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_8 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %21) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg9 = %c0 to %14 step %c1 {
scf.for %arg10 = %c0 to %17 step %c1 {
scf.for %arg11 = %c0 to %21 step %c1 {
%38 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%20, %arg11)
%39 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg10, %arg0, %16, %20, %arg11)
%40 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg10, %arg0, %16, %20, %arg11)
%subview_11 = memref.subview %subview_8[%arg9, %39, %40, %38] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_12 = memref.subview %alloca[%arg9, 0, %arg10, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
linalg.copy ins(%subview_11 : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_12 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>)
}
}
}
%alloca_9 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_10 = memref.subview %alloca_9[0, 0, 0, 0] [%14, 1, %17, %21] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_10 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_5 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%24 : memref<1x1x1x4xf16, #gpu.address_space<private>>) outs(%subview_5 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%25:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_6 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%26 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg7)[%25#0]
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%25#1, %arg1)
%30 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %29)
%31 = arith.cmpi eq, %30, %c0 : index
%32 = arith.ori %31, %28 : i1
scf.if %32 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%38 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%29]
%subview_8 = memref.subview %1[%26, %38] [%27, %30] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_9 = memref.subview %subview_6[0, 0] [%27, %30] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%33 = vector.transfer_read %expand_shape[%arg2, %c0, %arg4, %6, %c0, %7], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%34 = vector.transpose %33, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_7 = memref.expand_shape %alloc_1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%35 = vector.transfer_read %expand_shape_7[%c0, %7, %c0, %6], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%36 = vector.transpose %35, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%37 = iree_gpu.multi_mma %34, %36, %arg8 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %37 : vector<1x1x1x1x4x1xf32>
}
%subview_4 = memref.subview %subview_3[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%9 = vector.transpose %8, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %9, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#iree_gpu.lane_id<0>]}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%collapse_shape = memref.collapse_shape %alloc_2 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 17, %5) {
%6 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
%7 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg5)[%4]
%subview_3 = memref.subview %collapse_shape[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg6 = %c0 to %7 step %c1 {
%subview_5 = memref.subview %subview_4[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %subview_3[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%subview_6 : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_5 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
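// Note (not part of the pass output): a minimal, hypothetical sketch of the per-lane offsets
// used by the vector.transfer_reads that feed iree_gpu.multi_mma above. Each of the 64 lanes
// in the subgroup addresses one position along the 16-wide dimension and a 4-element slice of
// the 16-deep K dimension of the MFMA_F32_16x16x16_F16 operand tiles.
func.func @mfma_lane_offsets(%lane: index) -> (index, index) {
  // Position along the 16-wide dimension: %lane mod 16.
  %pos = affine.apply affine_map<(d0) -> (d0 mod 16)>(%lane)
  // Start of the lane's 4-element K slice: (%lane floordiv 16) * 4, one of {0, 4, 8, 12}.
  %kbase = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%lane)
  return %pos, %kbase : index, index
}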
// -----// IR Dump After NormalizeLoopBoundsPass (iree-codegen-normalize-loop-bounds) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 2, 1) {
%subview_3 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.forall (%arg6) in (64) {
%6 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg6)
%7 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg6)
%8 = scf.for %arg7 = %c0 to %c721 step %c1 iter_args(%arg8 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg6, %arg4, %arg2)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%20 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg7, %11#2)
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = scf.if %23 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_8 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %21) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg9 = %c0 to %14 step %c1 {
scf.for %arg10 = %c0 to %17 step %c1 {
scf.for %arg11 = %c0 to %21 step %c1 {
%38 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%20, %arg11)
%39 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg10, %arg0, %16, %20, %arg11)
%40 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg10, %arg0, %16, %20, %arg11)
%subview_11 = memref.subview %subview_8[%arg9, %39, %40, %38] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_12 = memref.subview %alloca[%arg9, 0, %arg10, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
linalg.copy ins(%subview_11 : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_12 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>)
}
}
}
%alloca_9 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_10 = memref.subview %alloca_9[0, 0, 0, 0] [%14, 1, %17, %21] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_10 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_5 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%24 : memref<1x1x1x4xf16, #gpu.address_space<private>>) outs(%subview_5 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%25:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_6 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%26 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg7)[%25#0]
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%25#1, %arg1)
%30 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %29)
%31 = arith.cmpi eq, %30, %c0 : index
%32 = arith.ori %31, %28 : i1
scf.if %32 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%38 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%29]
%subview_8 = memref.subview %1[%26, %38] [%27, %30] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_9 = memref.subview %subview_6[0, 0] [%27, %30] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%33 = vector.transfer_read %expand_shape[%arg2, %c0, %arg4, %6, %c0, %7], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%34 = vector.transpose %33, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_7 = memref.expand_shape %alloc_1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%35 = vector.transfer_read %expand_shape_7[%c0, %7, %c0, %6], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%36 = vector.transpose %35, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%37 = iree_gpu.multi_mma %34, %36, %arg8 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %37 : vector<1x1x1x1x4x1xf32>
}
%subview_4 = memref.subview %subview_3[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%9 = vector.transpose %8, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %9, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#iree_gpu.lane_id<0>]}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%collapse_shape = memref.collapse_shape %alloc_2 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 17, %5) {
%6 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
%7 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg5)[%4]
%subview_3 = memref.subview %collapse_shape[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg6 = %c0 to %7 step %c1 {
%subview_5 = memref.subview %subview_4[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %subview_3[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%subview_6 : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_5 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After GPUVerifyDistributionPass (iree-codegen-gpu-verify-distribution) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 2, 1) {
%subview_3 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.forall (%arg6) in (64) {
%6 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg6)
%7 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg6)
%8 = scf.for %arg7 = %c0 to %c721 step %c1 iter_args(%arg8 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg6, %arg4, %arg2)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%20 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg7, %11#2)
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = scf.if %23 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_8 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %21) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg9 = %c0 to %14 step %c1 {
scf.for %arg10 = %c0 to %17 step %c1 {
scf.for %arg11 = %c0 to %21 step %c1 {
%38 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%20, %arg11)
%39 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg10, %arg0, %16, %20, %arg11)
%40 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg10, %arg0, %16, %20, %arg11)
%subview_11 = memref.subview %subview_8[%arg9, %39, %40, %38] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_12 = memref.subview %alloca[%arg9, 0, %arg10, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
linalg.copy ins(%subview_11 : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_12 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>)
}
}
}
%alloca_9 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_10 = memref.subview %alloca_9[0, 0, 0, 0] [%14, 1, %17, %21] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_10 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_5 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%24 : memref<1x1x1x4xf16, #gpu.address_space<private>>) outs(%subview_5 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%25:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_6 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%26 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg7)[%25#0]
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%25#1, %arg1)
%30 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %29)
%31 = arith.cmpi eq, %30, %c0 : index
%32 = arith.ori %31, %28 : i1
scf.if %32 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%38 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%29]
%subview_8 = memref.subview %1[%26, %38] [%27, %30] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_9 = memref.subview %subview_6[0, 0] [%27, %30] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%33 = vector.transfer_read %expand_shape[%arg2, %c0, %arg4, %6, %c0, %7], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%34 = vector.transpose %33, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_7 = memref.expand_shape %alloc_1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%35 = vector.transfer_read %expand_shape_7[%c0, %7, %c0, %6], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%36 = vector.transpose %35, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%37 = iree_gpu.multi_mma %34, %36, %arg8 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %37 : vector<1x1x1x1x4x1xf32>
}
%subview_4 = memref.subview %subview_3[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%9 = vector.transpose %8, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %9, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#iree_gpu.lane_id<0>]}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%collapse_shape = memref.collapse_shape %alloc_2 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 17, %5) {
%6 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
%7 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg5)[%4]
%subview_3 = memref.subview %collapse_shape[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg6 = %c0 to %7 step %c1 {
%subview_5 = memref.subview %subview_4[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %subview_3[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%subview_6 : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_5 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After GPUDistributeForallPass (iree-codegen-gpu-distribute-forall) //----- //
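// Note: relative to the previous dump, the warp-mapped and lane-mapped scf.forall loops are now distributed explicitly: the warp-level forall becomes an scf.for driven by the linearized gpu.thread_id (256 threads per workgroup, delinearized into 4 warps of 64), and the lane-level forall is replaced by gpu.lane_id.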
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%thread_id_z = gpu.thread_id z
%thread_id_y = gpu.thread_id y
%thread_id_x = gpu.thread_id x
%0 = affine.linearize_index disjoint [%thread_id_z, %thread_id_y, %thread_id_x] by (1, 1, 256) : index
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%4 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%5 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%6:2 = affine.delinearize_index %0 into (4, 64) : index, index
%c2 = arith.constant 2 : index
%c1_3 = arith.constant 1 : index
%c2_4 = arith.constant 2 : index
%c1_5 = arith.constant 1 : index
%c0_6 = arith.constant 0 : index
%c4 = arith.constant 4 : index
%c4_7 = arith.constant 4 : index
gpu.barrier
scf.for %arg2 = %c0_6 to %c4 step %c4_7 {
%9 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg2, %6#0)
%10:4 = affine.delinearize_index %9 into (2, 1, 2, 1) : index, index, index, index
%subview_10 = memref.subview %alloc_2[%10#0, 0, %10#2, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%11 = gpu.lane_id
%12 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%11)
%13 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%11)
%14 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%16 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%11, %10#2, %10#0)
%17:3 = affine.delinearize_index %16 into (2, 32, 4) : index, index, index
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%17#2)
%19 = affine.min affine_map<(d0) -> (2, d0)>(%17#0)
%20 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%19)
%21 = arith.cmpi eq, %20, %c0 : index
%22 = affine.min affine_map<(d0) -> (17, d0)>(%17#1)
%23 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = arith.ori %24, %21 : i1
%26 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg3, %17#2)
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %25 : i1
%30 = scf.if %29 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_15 = memref.subview %1[%19, 0, 0, 0] [%20, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%20, %23, %27) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg5 = %c0 to %20 step %c1 {
scf.for %arg6 = %c0 to %23 step %c1 {
scf.for %arg7 = %c0 to %27 step %c1 {
%44 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%26, %arg7)
%45 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg6, %arg0, %22, %26, %arg7)
%46 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg6, %arg0, %22, %26, %arg7)
%subview_18 = memref.subview %subview_15[%arg5, %45, %46, %44] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_19 = memref.subview %alloca[%arg5, 0, %arg6, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
linalg.copy ins(%subview_18 : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_19 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>)
}
}
}
%alloca_16 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_16 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_17 = memref.subview %alloca_16[0, 0, 0, 0] [%20, 1, %23, %27] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_17 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_16 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_12 = memref.subview %alloc[%17#0, 0, %17#1, %18] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%30 : memref<1x1x1x4xf16, #gpu.address_space<private>>) outs(%subview_12 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%31:2 = affine.delinearize_index %16 into (16, 16) : index, index
%subview_13 = memref.subview %alloc_1[%31#0, %31#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%32 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%31#0]
%33 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%32)
%34 = arith.cmpi eq, %33, %c0 : index
%35 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%31#1, %arg1)
%36 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%5, %35)
%37 = arith.cmpi eq, %36, %c0 : index
%38 = arith.ori %37, %34 : i1
scf.if %38 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_13 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%44 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%35]
%subview_15 = memref.subview %2[%32, %44] [%33, %36] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_13 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_16 = memref.subview %subview_13[0, 0] [%33, %36] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_15, %subview_16 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%39 = vector.transfer_read %expand_shape[%10#0, %c0, %10#2, %12, %c0, %13], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%40 = vector.transpose %39, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_14 = memref.expand_shape %alloc_1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%41 = vector.transfer_read %expand_shape_14[%c0, %13, %c0, %12], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%42 = vector.transpose %41, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%43 = iree_gpu.multi_mma %40, %42, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %43 : vector<1x1x1x1x4x1xf32>
}
%subview_11 = memref.subview %subview_10[0, 0, 0, %13, 0, %12] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%15 = vector.transpose %14, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %15, %subview_11[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%collapse_shape = memref.collapse_shape %alloc_2 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %3[0, %arg0, 0, %4] [2, 1, 17, %5] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%7 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%5)
%c2_8 = arith.constant 2 : index
%c1_9 = arith.constant 1 : index
%c17 = arith.constant 17 : index
%8 = affine.apply affine_map<(d0) -> ((d0 ceildiv 4) * 34)>(%5)
%c256 = arith.constant 256 : index
gpu.barrier
scf.for %arg2 = %0 to %8 step %c256 {
%9:4 = affine.delinearize_index %arg2 into (2, 1, 17, %7) : index, index, index, index
%10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%9#3)
%11 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%9#3)[%5]
%subview_10 = memref.subview %collapse_shape[%9#0, 0, %9#2, %10] [1, 1, 1, %11] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_11 = memref.subview %subview[%9#0, 0, %9#2, %10] [1, 1, 1, %11] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg3 = %c0 to %11 step %c1 {
%subview_12 = memref.subview %subview_11[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_13 = memref.subview %subview_10[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%subview_13 : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_12 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
}
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After VectorizeMemrefCopyPass (iree-codegen-vectorize-memref-copy) //----- //
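// Note: the scalar linalg.copy ops from the previous dump are rewritten as vector.transfer_read / vector.transfer_write pairs; the dynamically shaped memref.copy ops (into the private alloca and the 16x16 shared-memory tile) are left as memref.copy.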
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c256 = arith.constant 256 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c721 = arith.constant 721 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%thread_id_z = gpu.thread_id z
%thread_id_y = gpu.thread_id y
%thread_id_x = gpu.thread_id x
%0 = affine.linearize_index disjoint [%thread_id_z, %thread_id_y, %thread_id_x] by (1, 1, 256) : index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%4 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%5 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%6:2 = affine.delinearize_index %0 into (4, 64) : index, index
gpu.barrier
scf.for %arg2 = %c0 to %c4 step %c4 {
%9 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg2, %6#0)
%10:4 = affine.delinearize_index %9 into (2, 1, 2, 1) : index, index, index, index
%subview_4 = memref.subview %alloc_3[%10#0, 0, %10#2, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%11 = gpu.lane_id
%12 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%11)
%13 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%11)
%14 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %cst_1) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%16 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%11, %10#2, %10#0)
%17:3 = affine.delinearize_index %16 into (2, 32, 4) : index, index, index
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%17#2)
%19 = affine.min affine_map<(d0) -> (2, d0)>(%17#0)
%20 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%19)
%21 = arith.cmpi eq, %20, %c0 : index
%22 = affine.min affine_map<(d0) -> (17, d0)>(%17#1)
%23 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = arith.ori %24, %21 : i1
%26 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg3, %17#2)
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %25 : i1
%30 = scf.if %29 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_9 = memref.subview %1[%19, 0, 0, 0] [%20, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%20, %23, %27) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg5 = %c0 to %20 step %c1 {
scf.for %arg6 = %c0 to %23 step %c1 {
scf.for %arg7 = %c0 to %27 step %c1 {
%45 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%26, %arg7)
%46 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg6, %arg0, %22, %26, %arg7)
%47 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg6, %arg0, %22, %26, %arg7)
%subview_12 = memref.subview %subview_9[%arg5, %46, %47, %45] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_13 = memref.subview %alloca[%arg5, 0, %arg6, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
%48 = vector.transfer_read %subview_12[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1xf16>
vector.transfer_write %48, %subview_13[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf16>, memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
}
}
}
%alloca_10 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_11 = memref.subview %alloca_10[0, 0, 0, 0] [%20, 1, %23, %27] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_11 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_6 = memref.subview %alloc[%17#0, 0, %17#1, %18] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%31 = vector.transfer_read %30[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x4xf16, #gpu.address_space<private>>, vector<1x1x1x4xf16>
vector.transfer_write %31, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf16>, memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%32:2 = affine.delinearize_index %16 into (16, 16) : index, index
%subview_7 = memref.subview %alloc_2[%32#0, %32#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%33 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%32#0]
%34 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%33)
%35 = arith.cmpi eq, %34, %c0 : index
%36 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%32#1, %arg1)
%37 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%5, %36)
%38 = arith.cmpi eq, %37, %c0 : index
%39 = arith.ori %38, %35 : i1
scf.if %39 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%45 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%36]
%subview_9 = memref.subview %2[%33, %45] [%34, %37] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_10 = memref.subview %subview_7[0, 0] [%34, %37] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_9, %subview_10 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%40 = vector.transfer_read %expand_shape[%10#0, %c0, %10#2, %12, %c0, %13], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%41 = vector.transpose %40, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_8 = memref.expand_shape %alloc_2 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%42 = vector.transfer_read %expand_shape_8[%c0, %13, %c0, %12], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%43 = vector.transpose %42, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%44 = iree_gpu.multi_mma %41, %43, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %44 : vector<1x1x1x1x4x1xf32>
}
%subview_5 = memref.subview %subview_4[0, 0, 0, %13, 0, %12] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%15 = vector.transpose %14, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %15, %subview_5[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%collapse_shape = memref.collapse_shape %alloc_3 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %3[0, %arg0, 0, %4] [2, 1, 17, %5] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%7 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%5)
%8 = affine.apply affine_map<(d0) -> ((d0 ceildiv 4) * 34)>(%5)
gpu.barrier
scf.for %arg2 = %0 to %8 step %c256 {
%9:4 = affine.delinearize_index %arg2 into (2, 1, 17, %7) : index, index, index, index
%10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%9#3)
%11 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%9#3)[%5]
%subview_4 = memref.subview %collapse_shape[%9#0, 0, %9#2, %10] [1, 1, 1, %11] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %subview[%9#0, 0, %9#2, %10] [1, 1, 1, %11] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg3 = %c0 to %11 step %c1 {
%subview_6 = memref.subview %subview_5[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %subview_4[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%12 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x1x1x1xf32>
vector.transfer_write %12, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf32>, memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After UnrollToIntrinsicsPass (iree-gpu-unroll-to-intrinsics) //----- //
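// Note: the unit outer dimensions of the iree_gpu.multi_mma operands are peeled off with vector.extract, leaving a single MFMA_F32_16x16x16_F16 intrinsic on vector<1x4xf16> inputs and a vector<4x1xf32> accumulator; the result is vector.broadcast back to the 1x1x1x1x4x1 accumulator shape carried by the scf.for.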
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c256 = arith.constant 256 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c721 = arith.constant 721 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%thread_id_z = gpu.thread_id z
%thread_id_y = gpu.thread_id y
%thread_id_x = gpu.thread_id x
%0 = affine.linearize_index disjoint [%thread_id_z, %thread_id_y, %thread_id_x] by (1, 1, 256) : index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%4 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%5 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%6:2 = affine.delinearize_index %0 into (4, 64) : index, index
gpu.barrier
scf.for %arg2 = %c0 to %c4 step %c4 {
%9 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg2, %6#0)
%10:4 = affine.delinearize_index %9 into (2, 1, 2, 1) : index, index, index, index
%subview_4 = memref.subview %alloc_3[%10#0, 0, %10#2, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%11 = gpu.lane_id
%12 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%11)
%13 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%11)
%14 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %cst_1) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%16 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%11, %10#2, %10#0)
%17:3 = affine.delinearize_index %16 into (2, 32, 4) : index, index, index
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%17#2)
%19 = affine.min affine_map<(d0) -> (2, d0)>(%17#0)
%20 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%19)
%21 = arith.cmpi eq, %20, %c0 : index
%22 = affine.min affine_map<(d0) -> (17, d0)>(%17#1)
%23 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = arith.ori %24, %21 : i1
%26 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg3, %17#2)
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %25 : i1
%30 = scf.if %29 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_9 = memref.subview %1[%19, 0, 0, 0] [%20, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%20, %23, %27) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg5 = %c0 to %20 step %c1 {
scf.for %arg6 = %c0 to %23 step %c1 {
scf.for %arg7 = %c0 to %27 step %c1 {
%49 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%26, %arg7)
%50 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg6, %arg0, %22, %26, %arg7)
%51 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg6, %arg0, %22, %26, %arg7)
%subview_12 = memref.subview %subview_9[%arg5, %50, %51, %49] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_13 = memref.subview %alloca[%arg5, 0, %arg6, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
%52 = vector.transfer_read %subview_12[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1xf16>
vector.transfer_write %52, %subview_13[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf16>, memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
}
}
}
%alloca_10 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_11 = memref.subview %alloca_10[0, 0, 0, 0] [%20, 1, %23, %27] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_11 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_6 = memref.subview %alloc[%17#0, 0, %17#1, %18] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%31 = vector.transfer_read %30[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x4xf16, #gpu.address_space<private>>, vector<1x1x1x4xf16>
vector.transfer_write %31, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf16>, memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%32:2 = affine.delinearize_index %16 into (16, 16) : index, index
%subview_7 = memref.subview %alloc_2[%32#0, %32#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%33 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%32#0]
%34 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%33)
%35 = arith.cmpi eq, %34, %c0 : index
%36 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%32#1, %arg1)
%37 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%5, %36)
%38 = arith.cmpi eq, %37, %c0 : index
%39 = arith.ori %38, %35 : i1
scf.if %39 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%49 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%36]
%subview_9 = memref.subview %2[%33, %49] [%34, %37] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_10 = memref.subview %subview_7[0, 0] [%34, %37] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_9, %subview_10 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%40 = vector.transfer_read %expand_shape[%10#0, %c0, %10#2, %12, %c0, %13], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%41 = vector.transpose %40, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_8 = memref.expand_shape %alloc_2 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%42 = vector.transfer_read %expand_shape_8[%c0, %13, %c0, %12], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%43 = vector.transpose %42, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%44 = vector.extract %41[0, 0, 0, 0] : vector<1x4xf16> from vector<1x1x1x1x1x4xf16>
%45 = vector.extract %43[0, 0] : vector<1x4xf16> from vector<1x1x1x4xf16>
%46 = vector.extract %arg4[0, 0, 0, 0] : vector<4x1xf32> from vector<1x1x1x1x4x1xf32>
%47 = iree_gpu.multi_mma %44, %45, %46 {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = [], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x4xf16>, vector<1x4xf16> into vector<4x1xf32>
%48 = vector.broadcast %47 : vector<4x1xf32> to vector<1x1x1x1x4x1xf32>
scf.yield %48 : vector<1x1x1x1x4x1xf32>
}
%subview_5 = memref.subview %subview_4[0, 0, 0, %13, 0, %12] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%15 = vector.transpose %14, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %15, %subview_5[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%collapse_shape = memref.collapse_shape %alloc_3 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %3[0, %arg0, 0, %4] [2, 1, 17, %5] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%7 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%5)
%8 = affine.apply affine_map<(d0) -> ((d0 ceildiv 4) * 34)>(%5)
gpu.barrier
scf.for %arg2 = %0 to %8 step %c256 {
%9:4 = affine.delinearize_index %arg2 into (2, 1, 17, %7) : index, index, index, index
%10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%9#3)
%11 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%9#3)[%5]
%subview_4 = memref.subview %collapse_shape[%9#0, 0, %9#2, %10] [1, 1, 1, %11] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %subview[%9#0, 0, %9#2, %10] [1, 1, 1, %11] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg3 = %c0 to %11 step %c1 {
%subview_6 = memref.subview %subview_5[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %subview_4[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%12 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x1x1x1xf32>
vector.transfer_write %12, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf32>, memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
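// Note: canonicalization folds the single-iteration scf.for over %c4, drops the unused constants and the y/z thread ids, and simplifies the (2, 1, 2, 1) delinearization to (2, 2). The dump below is truncated with the rest of the gist.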
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c721 = arith.constant 721 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%thread_id_x = gpu.thread_id x
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%5:2 = affine.delinearize_index %thread_id_x into (4, 64) : index, index
gpu.barrier
%6:2 = affine.delinearize_index %5#0 into (2, 2) : index, index
%subview = memref.subview %alloc_3[%6#0, 0, %6#1, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%7 = gpu.lane_id
%8 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%7)
%9 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%7)
%10 = scf.for %arg2 = %c0 to %c721 step %c1 iter_args(%arg3 = %cst_1) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%14 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%7, %6#1, %6#0)
%15:3 = affine.delinearize_index %14 into (2, 32, 4) : index, index, index
%16 = affine.apply affine_map<(d0) -> (d0 * 4)>(%15#2)
%17 = affine.min affine_map<(d0) -> (2, d0)>(%15#0)
%18 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%17)
%19 = arith.cmpi eq, %18, %c0 : index
%20 = affine.min affine_map<(d0) -> (17, d0)>(%15#1)
%21 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg2, %15#2)
%25 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%24)
%26 = arith.cmpi eq, %25, %c0 : index
%27 = arith.ori %26, %23 : i1
%28 = scf.if %27 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_9 = memref.subview %0[%17, 0, 0, 0] [%18, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%18, %21, %25) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg4 = %c0 to %18 step %c1 {
scf.for %arg5 = %c0 to %21 step %c1 {
scf.for %arg6 = %c0 to %25 step %c1 {
%47 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%24, %arg6)
%48 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg5, %arg0, %20, %24, %arg6)
%49 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg5, %arg0, %20, %24, %arg6)
%subview_12 = memref.subview %subview_9[%arg4, %48, %49, %47] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_13 = memref.subview %alloca[%arg4, 0, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
%50 = vector.transfer_read %subview_12[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1xf16>
vector.transfer_write %50, %subview_13[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf16>, memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
}
}
}
%alloca_10 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_11 = memref.subview %alloca_10[0, 0, 0, 0] [%18, 1, %21, %25] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_11 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_6 = memref.subview %alloc[%15#0, 0, %15#1, %16] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%29 = vector.transfer_read %28[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x4xf16, #gpu.address_space<private>>, vector<1x1x1x4xf16>
vector.transfer_write %29, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf16>, memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%30:2 = affine.delinearize_index %14 into (16, 16) : index, index
%subview_7 = memref.subview %alloc_2[%30#0, %30#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%31 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg2)[%30#0]
%32 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%30#1, %arg1)
%35 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %36, %33 : i1
scf.if %37 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%47 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%34]
%subview_9 = memref.subview %1[%31, %47] [%32, %35] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_10 = memref.subview %subview_7[0, 0] [%32, %35] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_9, %subview_10 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%38 = vector.transfer_read %expand_shape[%6#0, %c0, %6#1, %8, %c0, %9], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%39 = vector.transpose %38, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_8 = memref.expand_shape %alloc_2 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%40 = vector.transfer_read %expand_shape_8[%c0, %9, %c0, %8], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%41 = vector.transpose %40, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%42 = vector.extract %39[0, 0, 0, 0] : vector<1x4xf16> from vector<1x1x1x1x1x4xf16>
%43 = vector.extract %41[0, 0] : vector<1x4xf16> from vector<1x1x1x4xf16>
%44 = vector.extract %arg3[0, 0, 0, 0] : vector<4x1xf32> from vector<1x1x1x1x4x1xf32>
%45 = iree_gpu.multi_mma %42, %43, %44 {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = [], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x4xf16>, vector<1x4xf16> into vector<4x1xf32>
%46 = vector.broadcast %45 : vector<4x1xf32> to vector<1x1x1x1x4x1xf32>
scf.yield %46 : vector<1x1x1x1x4x1xf32>
}
%subview_4 = memref.subview %subview[0, 0, 0, %9, 0, %8] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%11 = vector.transpose %10, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %11, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%collapse_shape = memref.collapse_shape %alloc_3 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%12 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
%13 = affine.apply affine_map<(d0) -> ((d0 ceildiv 4) * 34)>(%4)
gpu.barrier
scf.for %arg2 = %thread_id_x to %13 step %c256 {
%14:3 = affine.delinearize_index %arg2 into (2, 17, %12) : index, index, index
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%14#2)
%16 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%14#2)[%4]
%subview_6 = memref.subview %collapse_shape[%14#0, 0, %14#1, %15] [1, 1, 1, %16] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview_5[%14#0, 0, %14#1, %15] [1, 1, 1, %16] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg3 = %c0 to %16 step %c1 {
%subview_8 = memref.subview %subview_7[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_9 = memref.subview %subview_6[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%17 = vector.transfer_read %subview_9[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x1x1x1xf32>
vector.transfer_write %17, %subview_8[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf32>, memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
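// Note: in the portion shown, CSE leaves this function unchanged relative to the preceding dump; the per-iteration index arithmetic inside the reduction loop is only pulled out later, by loop-invariant code motion further down.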
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c721 = arith.constant 721 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%thread_id_x = gpu.thread_id x
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%5:2 = affine.delinearize_index %thread_id_x into (4, 64) : index, index
gpu.barrier
%6:2 = affine.delinearize_index %5#0 into (2, 2) : index, index
%subview = memref.subview %alloc_3[%6#0, 0, %6#1, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%7 = gpu.lane_id
%8 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%7)
%9 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%7)
%10 = scf.for %arg2 = %c0 to %c721 step %c1 iter_args(%arg3 = %cst_1) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%14 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%7, %6#1, %6#0)
%15:3 = affine.delinearize_index %14 into (2, 32, 4) : index, index, index
%16 = affine.apply affine_map<(d0) -> (d0 * 4)>(%15#2)
%17 = affine.min affine_map<(d0) -> (2, d0)>(%15#0)
%18 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%17)
%19 = arith.cmpi eq, %18, %c0 : index
%20 = affine.min affine_map<(d0) -> (17, d0)>(%15#1)
%21 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg2, %15#2)
%25 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%24)
%26 = arith.cmpi eq, %25, %c0 : index
%27 = arith.ori %26, %23 : i1
%28 = scf.if %27 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_9 = memref.subview %0[%17, 0, 0, 0] [%18, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%18, %21, %25) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg4 = %c0 to %18 step %c1 {
scf.for %arg5 = %c0 to %21 step %c1 {
scf.for %arg6 = %c0 to %25 step %c1 {
%47 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%24, %arg6)
%48 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg5, %arg0, %20, %24, %arg6)
%49 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg5, %arg0, %20, %24, %arg6)
%subview_12 = memref.subview %subview_9[%arg4, %48, %49, %47] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_13 = memref.subview %alloca[%arg4, 0, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
%50 = vector.transfer_read %subview_12[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1xf16>
vector.transfer_write %50, %subview_13[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf16>, memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
}
}
}
%alloca_10 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_11 = memref.subview %alloca_10[0, 0, 0, 0] [%18, 1, %21, %25] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_11 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_6 = memref.subview %alloc[%15#0, 0, %15#1, %16] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%29 = vector.transfer_read %28[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x4xf16, #gpu.address_space<private>>, vector<1x1x1x4xf16>
vector.transfer_write %29, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf16>, memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%30:2 = affine.delinearize_index %14 into (16, 16) : index, index
%subview_7 = memref.subview %alloc_2[%30#0, %30#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%31 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg2)[%30#0]
%32 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%30#1, %arg1)
%35 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %36, %33 : i1
scf.if %37 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%47 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%34]
%subview_9 = memref.subview %1[%31, %47] [%32, %35] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_10 = memref.subview %subview_7[0, 0] [%32, %35] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_9, %subview_10 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%38 = vector.transfer_read %expand_shape[%6#0, %c0, %6#1, %8, %c0, %9], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%39 = vector.transpose %38, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_8 = memref.expand_shape %alloc_2 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%40 = vector.transfer_read %expand_shape_8[%c0, %9, %c0, %8], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%41 = vector.transpose %40, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%42 = vector.extract %39[0, 0, 0, 0] : vector<1x4xf16> from vector<1x1x1x1x1x4xf16>
%43 = vector.extract %41[0, 0] : vector<1x4xf16> from vector<1x1x1x4xf16>
%44 = vector.extract %arg3[0, 0, 0, 0] : vector<4x1xf32> from vector<1x1x1x1x4x1xf32>
%45 = iree_gpu.multi_mma %42, %43, %44 {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = [], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x4xf16>, vector<1x4xf16> into vector<4x1xf32>
%46 = vector.broadcast %45 : vector<4x1xf32> to vector<1x1x1x1x4x1xf32>
scf.yield %46 : vector<1x1x1x1x4x1xf32>
}
%subview_4 = memref.subview %subview[0, 0, 0, %9, 0, %8] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%11 = vector.transpose %10, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %11, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%collapse_shape = memref.collapse_shape %alloc_3 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%12 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
%13 = affine.apply affine_map<(d0) -> ((d0 ceildiv 4) * 34)>(%4)
gpu.barrier
scf.for %arg2 = %thread_id_x to %13 step %c256 {
%14:3 = affine.delinearize_index %arg2 into (2, 17, %12) : index, index, index
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%14#2)
%16 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%14#2)[%4]
%subview_6 = memref.subview %collapse_shape[%14#0, 0, %14#1, %15] [1, 1, 1, %16] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview_5[%14#0, 0, %14#1, %15] [1, 1, 1, %16] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg3 = %c0 to %16 step %c1 {
%subview_8 = memref.subview %subview_7[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_9 = memref.subview %subview_6[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%17 = vector.transfer_read %subview_9[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x1x1x1xf32>
vector.transfer_write %17, %subview_8[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf32>, memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After LowerIREEGPUOpsPass (iree-gpu-lower-ops) //----- //
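// Note: this pass lowers the iree_gpu.multi_mma seen in the previous dump into vector.shape_cast ops feeding a single amdgpu.mfma (m = 16, n = 16, k = 16, f16 operands accumulating into f32) inside the scf.for body below; the rest of the function is untouched.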
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c721 = arith.constant 721 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%thread_id_x = gpu.thread_id x
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%5:2 = affine.delinearize_index %thread_id_x into (4, 64) : index, index
gpu.barrier
%6:2 = affine.delinearize_index %5#0 into (2, 2) : index, index
%subview = memref.subview %alloc_3[%6#0, 0, %6#1, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%7 = gpu.lane_id
%8 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%7)
%9 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%7)
%10 = scf.for %arg2 = %c0 to %c721 step %c1 iter_args(%arg3 = %cst_1) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%14 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%7, %6#1, %6#0)
%15:3 = affine.delinearize_index %14 into (2, 32, 4) : index, index, index
%16 = affine.apply affine_map<(d0) -> (d0 * 4)>(%15#2)
%17 = affine.min affine_map<(d0) -> (2, d0)>(%15#0)
%18 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%17)
%19 = arith.cmpi eq, %18, %c0 : index
%20 = affine.min affine_map<(d0) -> (17, d0)>(%15#1)
%21 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg2, %15#2)
%25 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%24)
%26 = arith.cmpi eq, %25, %c0 : index
%27 = arith.ori %26, %23 : i1
%28 = scf.if %27 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_9 = memref.subview %0[%17, 0, 0, 0] [%18, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%18, %21, %25) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg4 = %c0 to %18 step %c1 {
scf.for %arg5 = %c0 to %21 step %c1 {
scf.for %arg6 = %c0 to %25 step %c1 {
%51 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%24, %arg6)
%52 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg5, %arg0, %20, %24, %arg6)
%53 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg5, %arg0, %20, %24, %arg6)
%subview_12 = memref.subview %subview_9[%arg4, %52, %53, %51] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_13 = memref.subview %alloca[%arg4, 0, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
%54 = vector.transfer_read %subview_12[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1xf16>
vector.transfer_write %54, %subview_13[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf16>, memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
}
}
}
%alloca_10 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_11 = memref.subview %alloca_10[0, 0, 0, 0] [%18, 1, %21, %25] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_11 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_6 = memref.subview %alloc[%15#0, 0, %15#1, %16] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%29 = vector.transfer_read %28[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x4xf16, #gpu.address_space<private>>, vector<1x1x1x4xf16>
vector.transfer_write %29, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf16>, memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%30:2 = affine.delinearize_index %14 into (16, 16) : index, index
%subview_7 = memref.subview %alloc_2[%30#0, %30#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%31 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg2)[%30#0]
%32 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%30#1, %arg1)
%35 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %36, %33 : i1
scf.if %37 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%51 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%34]
%subview_9 = memref.subview %1[%31, %51] [%32, %35] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_10 = memref.subview %subview_7[0, 0] [%32, %35] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_9, %subview_10 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%38 = vector.transfer_read %expand_shape[%6#0, %c0, %6#1, %8, %c0, %9], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%39 = vector.transpose %38, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_8 = memref.expand_shape %alloc_2 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%40 = vector.transfer_read %expand_shape_8[%c0, %9, %c0, %8], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%41 = vector.transpose %40, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%42 = vector.extract %39[0, 0, 0, 0] : vector<1x4xf16> from vector<1x1x1x1x1x4xf16>
%43 = vector.extract %41[0, 0] : vector<1x4xf16> from vector<1x1x1x4xf16>
%44 = vector.extract %arg3[0, 0, 0, 0] : vector<4x1xf32> from vector<1x1x1x1x4x1xf32>
%45 = vector.shape_cast %42 : vector<1x4xf16> to vector<4xf16>
%46 = vector.shape_cast %43 : vector<1x4xf16> to vector<4xf16>
%47 = vector.shape_cast %44 : vector<4x1xf32> to vector<4xf32>
%48 = amdgpu.mfma %45 * %46 + %47 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%49 = vector.shape_cast %48 : vector<4xf32> to vector<4x1xf32>
%50 = vector.broadcast %49 : vector<4x1xf32> to vector<1x1x1x1x4x1xf32>
scf.yield %50 : vector<1x1x1x1x4x1xf32>
}
%subview_4 = memref.subview %subview[0, 0, 0, %9, 0, %8] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%11 = vector.transpose %10, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %11, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%collapse_shape = memref.collapse_shape %alloc_3 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%12 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
%13 = affine.apply affine_map<(d0) -> ((d0 ceildiv 4) * 34)>(%4)
gpu.barrier
scf.for %arg2 = %thread_id_x to %13 step %c256 {
%14:3 = affine.delinearize_index %arg2 into (2, 17, %12) : index, index, index
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%14#2)
%16 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%14#2)[%4]
%subview_6 = memref.subview %collapse_shape[%14#0, 0, %14#1, %15] [1, 1, 1, %16] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview_5[%14#0, 0, %14#1, %15] [1, 1, 1, %16] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg3 = %c0 to %16 step %c1 {
%subview_8 = memref.subview %subview_7[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_9 = memref.subview %subview_6[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%17 = vector.transfer_read %subview_9[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x1x1x1xf32>
vector.transfer_write %17, %subview_8[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf32>, memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After UnrollAnnotatedLoopsPass (iree-codegen-unroll-annotated-loops) //----- //
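// Note: none of the scf.for loops in this dispatch carry unroll annotations, so this pass makes no visible change to the IR below.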
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c721 = arith.constant 721 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%thread_id_x = gpu.thread_id x
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%5:2 = affine.delinearize_index %thread_id_x into (4, 64) : index, index
gpu.barrier
%6:2 = affine.delinearize_index %5#0 into (2, 2) : index, index
%subview = memref.subview %alloc_3[%6#0, 0, %6#1, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%7 = gpu.lane_id
%8 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%7)
%9 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%7)
%10 = scf.for %arg2 = %c0 to %c721 step %c1 iter_args(%arg3 = %cst_1) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%14 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%7, %6#1, %6#0)
%15:3 = affine.delinearize_index %14 into (2, 32, 4) : index, index, index
%16 = affine.apply affine_map<(d0) -> (d0 * 4)>(%15#2)
%17 = affine.min affine_map<(d0) -> (2, d0)>(%15#0)
%18 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%17)
%19 = arith.cmpi eq, %18, %c0 : index
%20 = affine.min affine_map<(d0) -> (17, d0)>(%15#1)
%21 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg2, %15#2)
%25 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%24)
%26 = arith.cmpi eq, %25, %c0 : index
%27 = arith.ori %26, %23 : i1
%28 = scf.if %27 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_9 = memref.subview %0[%17, 0, 0, 0] [%18, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%18, %21, %25) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg4 = %c0 to %18 step %c1 {
scf.for %arg5 = %c0 to %21 step %c1 {
scf.for %arg6 = %c0 to %25 step %c1 {
%51 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%24, %arg6)
%52 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg5, %arg0, %20, %24, %arg6)
%53 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg5, %arg0, %20, %24, %arg6)
%subview_12 = memref.subview %subview_9[%arg4, %52, %53, %51] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_13 = memref.subview %alloca[%arg4, 0, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
%54 = vector.transfer_read %subview_12[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1xf16>
vector.transfer_write %54, %subview_13[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf16>, memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
}
}
}
%alloca_10 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_11 = memref.subview %alloca_10[0, 0, 0, 0] [%18, 1, %21, %25] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_11 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_6 = memref.subview %alloc[%15#0, 0, %15#1, %16] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%29 = vector.transfer_read %28[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x4xf16, #gpu.address_space<private>>, vector<1x1x1x4xf16>
vector.transfer_write %29, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf16>, memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%30:2 = affine.delinearize_index %14 into (16, 16) : index, index
%subview_7 = memref.subview %alloc_2[%30#0, %30#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%31 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg2)[%30#0]
%32 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%30#1, %arg1)
%35 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %36, %33 : i1
scf.if %37 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%51 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%34]
%subview_9 = memref.subview %1[%31, %51] [%32, %35] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_10 = memref.subview %subview_7[0, 0] [%32, %35] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_9, %subview_10 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%38 = vector.transfer_read %expand_shape[%6#0, %c0, %6#1, %8, %c0, %9], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%39 = vector.transpose %38, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_8 = memref.expand_shape %alloc_2 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%40 = vector.transfer_read %expand_shape_8[%c0, %9, %c0, %8], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%41 = vector.transpose %40, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%42 = vector.extract %39[0, 0, 0, 0] : vector<1x4xf16> from vector<1x1x1x1x1x4xf16>
%43 = vector.extract %41[0, 0] : vector<1x4xf16> from vector<1x1x1x4xf16>
%44 = vector.extract %arg3[0, 0, 0, 0] : vector<4x1xf32> from vector<1x1x1x1x4x1xf32>
%45 = vector.shape_cast %42 : vector<1x4xf16> to vector<4xf16>
%46 = vector.shape_cast %43 : vector<1x4xf16> to vector<4xf16>
%47 = vector.shape_cast %44 : vector<4x1xf32> to vector<4xf32>
%48 = amdgpu.mfma %45 * %46 + %47 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%49 = vector.shape_cast %48 : vector<4xf32> to vector<4x1xf32>
%50 = vector.broadcast %49 : vector<4x1xf32> to vector<1x1x1x1x4x1xf32>
scf.yield %50 : vector<1x1x1x1x4x1xf32>
}
%subview_4 = memref.subview %subview[0, 0, 0, %9, 0, %8] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%11 = vector.transpose %10, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %11, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%collapse_shape = memref.collapse_shape %alloc_3 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%12 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
%13 = affine.apply affine_map<(d0) -> ((d0 ceildiv 4) * 34)>(%4)
gpu.barrier
scf.for %arg2 = %thread_id_x to %13 step %c256 {
%14:3 = affine.delinearize_index %arg2 into (2, 17, %12) : index, index, index
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%14#2)
%16 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%14#2)[%4]
%subview_6 = memref.subview %collapse_shape[%14#0, 0, %14#1, %15] [1, 1, 1, %16] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview_5[%14#0, 0, %14#1, %15] [1, 1, 1, %16] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg3 = %c0 to %16 step %c1 {
%subview_8 = memref.subview %subview_7[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_9 = memref.subview %subview_6[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%17 = vector.transfer_read %subview_9[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x1x1x1xf32>
vector.transfer_write %17, %subview_8[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf32>, memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After IREELoopInvariantCodeMotionPass (iree-loop-invariant-code-motion) //----- //
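// Note: loop-invariant code motion hoists the thread-/lane-id-derived index math (affine.apply, affine.delinearize_index, affine.min), the workgroup-memory subviews, and the expand_shape ops out of the 721-iteration reduction loop (721 = ceildiv(11529, 16) K-tiles); only the k-dependent bounds computations, the global-to-shared copies, and the MFMA remain inside scf.for.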
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c721 = arith.constant 721 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%thread_id_x = gpu.thread_id x
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%5:2 = affine.delinearize_index %thread_id_x into (4, 64) : index, index
gpu.barrier
%6:2 = affine.delinearize_index %5#0 into (2, 2) : index, index
%subview = memref.subview %alloc_3[%6#0, 0, %6#1, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%7 = gpu.lane_id
%8 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%7)
%9 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%7)
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%7, %6#1, %6#0)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%subview_4 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%20:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_5 = memref.subview %alloc_2[%20#0, %20#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%21 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%20#1, %arg1)
%22 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %21)
%23 = arith.cmpi eq, %22, %c0 : index
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%expand_shape_6 = memref.expand_shape %alloc_2 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%24 = scf.for %arg2 = %c0 to %c721 step %c1 iter_args(%arg3 = %cst_1) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%28 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg2, %11#2)
%29 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%28)
%30 = arith.cmpi eq, %29, %c0 : index
%31 = arith.ori %30, %19 : i1
%32 = scf.if %31 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_9 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %29) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg4 = %c0 to %14 step %c1 {
scf.for %arg5 = %c0 to %17 step %c1 {
scf.for %arg6 = %c0 to %29 step %c1 {
%51 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%28, %arg6)
%52 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg5, %arg0, %16, %28, %arg6)
%53 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg5, %arg0, %16, %28, %arg6)
%subview_12 = memref.subview %subview_9[%arg4, %52, %53, %51] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_13 = memref.subview %alloca[%arg4, 0, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
%54 = vector.transfer_read %subview_12[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1xf16>
vector.transfer_write %54, %subview_13[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf16>, memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
}
}
}
%alloca_10 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_11 = memref.subview %alloca_10[0, 0, 0, 0] [%14, 1, %17, %29] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_11 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%33 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x4xf16, #gpu.address_space<private>>, vector<1x1x1x4xf16>
vector.transfer_write %33, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf16>, memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%34 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg2)[%20#0]
%35 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %23, %36 : i1
scf.if %37 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_5 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%51 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%21]
%subview_9 = memref.subview %1[%34, %51] [%35, %22] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_5 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_10 = memref.subview %subview_5[0, 0] [%35, %22] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_9, %subview_10 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%38 = vector.transfer_read %expand_shape[%6#0, %c0, %6#1, %8, %c0, %9], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%39 = vector.transpose %38, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%40 = vector.transfer_read %expand_shape_6[%c0, %9, %c0, %8], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%41 = vector.transpose %40, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%42 = vector.extract %39[0, 0, 0, 0] : vector<1x4xf16> from vector<1x1x1x1x1x4xf16>
%43 = vector.extract %41[0, 0] : vector<1x4xf16> from vector<1x1x1x4xf16>
%44 = vector.extract %arg3[0, 0, 0, 0] : vector<4x1xf32> from vector<1x1x1x1x4x1xf32>
%45 = vector.shape_cast %42 : vector<1x4xf16> to vector<4xf16>
%46 = vector.shape_cast %43 : vector<1x4xf16> to vector<4xf16>
%47 = vector.shape_cast %44 : vector<4x1xf32> to vector<4xf32>
%48 = amdgpu.mfma %45 * %46 + %47 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%49 = vector.shape_cast %48 : vector<4xf32> to vector<4x1xf32>
%50 = vector.broadcast %49 : vector<4x1xf32> to vector<1x1x1x1x4x1xf32>
scf.yield %50 : vector<1x1x1x1x4x1xf32>
}
%subview_7 = memref.subview %subview[0, 0, 0, %9, 0, %8] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%25 = vector.transpose %24, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %25, %subview_7[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%collapse_shape = memref.collapse_shape %alloc_3 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview_8 = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%26 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
%27 = affine.apply affine_map<(d0) -> ((d0 ceildiv 4) * 34)>(%4)
gpu.barrier
scf.for %arg2 = %thread_id_x to %27 step %c256 {
%28:3 = affine.delinearize_index %arg2 into (2, 17, %26) : index, index, index
%29 = affine.apply affine_map<(d0) -> (d0 * 4)>(%28#2)
%30 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%28#2)[%4]
%subview_9 = memref.subview %collapse_shape[%28#0, 0, %28#1, %29] [1, 1, 1, %30] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_10 = memref.subview %subview_8[%28#0, 0, %28#1, %29] [1, 1, 1, %30] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg3 = %c0 to %30 step %c1 {
%subview_11 = memref.subview %subview_10[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_12 = memref.subview %subview_9[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%31 = vector.transfer_read %subview_12[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x1x1x1xf32>
vector.transfer_write %31, %subview_11[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf32>, memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After GPUReduceBankConflictsPass (iree-codegen-gpu-reduce-bank-conflicts) //----- //
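// In the dump below this pass pads the innermost dimension of the f16 workgroup allocations
// (2x1x32x16 -> 2x1x32x20 and 16x16 -> 16x20) so that consecutive rows no longer start at the
// same shared-memory bank offset; all accesses then go through 16-wide subviews of the padded
// buffers. A minimal sketch of the resulting pattern, assuming a single 16x16 f16 tile:
//   %padded = memref.alloc() : memref<16x20xf16, #gpu.address_space<workgroup>>
//   %tile   = memref.subview %padded[0, 0] [16, 16] [1, 1]
//             : memref<16x20xf16, #gpu.address_space<workgroup>>
//             to memref<16x16xf16, strided<[20, 1]>, #gpu.address_space<workgroup>>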
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c721 = arith.constant 721 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%thread_id_x = gpu.thread_id x
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x20xf16, #gpu.address_space<workgroup>>
%subview = memref.subview %alloc[0, 0, 0, 0] [2, 1, 32, 16] [1, 1, 1, 1] : memref<2x1x32x20xf16, #gpu.address_space<workgroup>> to memref<2x1x32x16xf16, strided<[640, 640, 20, 1]>, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<16x20xf16, #gpu.address_space<workgroup>>
%subview_3 = memref.subview %alloc_2[0, 0] [16, 16] [1, 1] : memref<16x20xf16, #gpu.address_space<workgroup>> to memref<16x16xf16, strided<[20, 1]>, #gpu.address_space<workgroup>>
%alloc_4 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%5:2 = affine.delinearize_index %thread_id_x into (4, 64) : index, index
gpu.barrier
%6:2 = affine.delinearize_index %5#0 into (2, 2) : index, index
%subview_5 = memref.subview %alloc_4[%6#0, 0, %6#1, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%7 = gpu.lane_id
%8 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%7)
%9 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%7)
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%7, %6#1, %6#0)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%subview_6 = memref.subview %subview[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, strided<[640, 640, 20, 1]>, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[640, 640, 20, 1], offset: ?>, #gpu.address_space<workgroup>>
%20:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_7 = memref.subview %subview_3[%20#0, %20#1] [1, 1] [1, 1] : memref<16x16xf16, strided<[20, 1]>, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
%21 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%20#1, %arg1)
%22 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %21)
%23 = arith.cmpi eq, %22, %c0 : index
%expand_shape = memref.expand_shape %subview [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, strided<[640, 640, 20, 1]>, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, strided<[640, 640, 320, 20, 16, 1]>, #gpu.address_space<workgroup>>
%expand_shape_8 = memref.expand_shape %subview_3 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, strided<[20, 1]>, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, strided<[320, 20, 16, 1]>, #gpu.address_space<workgroup>>
%24 = scf.for %arg2 = %c0 to %c721 step %c1 iter_args(%arg3 = %cst_1) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%28 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg2, %11#2)
%29 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%28)
%30 = arith.cmpi eq, %29, %c0 : index
%31 = arith.ori %30, %19 : i1
%32 = scf.if %31 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_11 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %29) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg4 = %c0 to %14 step %c1 {
scf.for %arg5 = %c0 to %17 step %c1 {
scf.for %arg6 = %c0 to %29 step %c1 {
%51 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%28, %arg6)
%52 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg5, %arg0, %16, %28, %arg6)
%53 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg5, %arg0, %16, %28, %arg6)
%subview_14 = memref.subview %subview_11[%arg4, %52, %53, %51] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_15 = memref.subview %alloca[%arg4, 0, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
%54 = vector.transfer_read %subview_14[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1xf16>
vector.transfer_write %54, %subview_15[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf16>, memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
}
}
}
%alloca_12 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_12 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_13 = memref.subview %alloca_12[0, 0, 0, 0] [%14, 1, %17, %29] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_13 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_12 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%33 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x4xf16, #gpu.address_space<private>>, vector<1x1x1x4xf16>
vector.transfer_write %33, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf16>, memref<1x1x1x4xf16, strided<[640, 640, 20, 1], offset: ?>, #gpu.address_space<workgroup>>
%34 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg2)[%20#0]
%35 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %23, %36 : i1
scf.if %37 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_7 : memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%51 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%21]
%subview_11 = memref.subview %1[%34, %51] [%35, %22] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_7 : memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_12 = memref.subview %subview_7[0, 0] [%35, %22] [1, 1] : memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_11, %subview_12 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%38 = vector.transfer_read %expand_shape[%6#0, %c0, %6#1, %8, %c0, %9], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, strided<[640, 640, 320, 20, 16, 1]>, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%39 = vector.transpose %38, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%40 = vector.transfer_read %expand_shape_8[%c0, %9, %c0, %8], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, strided<[320, 20, 16, 1]>, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%41 = vector.transpose %40, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%42 = vector.extract %39[0, 0, 0, 0] : vector<1x4xf16> from vector<1x1x1x1x1x4xf16>
%43 = vector.extract %41[0, 0] : vector<1x4xf16> from vector<1x1x1x4xf16>
%44 = vector.extract %arg3[0, 0, 0, 0] : vector<4x1xf32> from vector<1x1x1x1x4x1xf32>
%45 = vector.shape_cast %42 : vector<1x4xf16> to vector<4xf16>
%46 = vector.shape_cast %43 : vector<1x4xf16> to vector<4xf16>
%47 = vector.shape_cast %44 : vector<4x1xf32> to vector<4xf32>
%48 = amdgpu.mfma %45 * %46 + %47 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%49 = vector.shape_cast %48 : vector<4xf32> to vector<4x1xf32>
%50 = vector.broadcast %49 : vector<4x1xf32> to vector<1x1x1x1x4x1xf32>
scf.yield %50 : vector<1x1x1x1x4x1xf32>
}
%subview_9 = memref.subview %subview_5[0, 0, 0, %9, 0, %8] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%25 = vector.transpose %24, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %25, %subview_9[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%collapse_shape = memref.collapse_shape %alloc_4 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview_10 = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%26 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
%27 = affine.apply affine_map<(d0) -> ((d0 ceildiv 4) * 34)>(%4)
gpu.barrier
scf.for %arg2 = %thread_id_x to %27 step %c256 {
%28:3 = affine.delinearize_index %arg2 into (2, 17, %26) : index, index, index
%29 = affine.apply affine_map<(d0) -> (d0 * 4)>(%28#2)
%30 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%28#2)[%4]
%subview_11 = memref.subview %collapse_shape[%28#0, 0, %28#1, %29] [1, 1, 1, %30] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_12 = memref.subview %subview_10[%28#0, 0, %28#1, %29] [1, 1, 1, %30] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg3 = %c0 to %30 step %c1 {
%subview_13 = memref.subview %subview_12[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_14 = memref.subview %subview_11[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%31 = vector.transfer_read %subview_14[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x1x1x1xf32>
vector.transfer_write %31, %subview_13[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf32>, memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
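// In the dump below, the memref.subview chains that fed vector.transfer_read/transfer_write have
// been folded into the transfers themselves, which now index the base buffers (%0, %2, %alloc,
// %alloc_4, and the private %alloca) with composed affine indices; the expand_shape/collapse_shape
// views and the subviews consumed by linalg ops and memref.copy are left in place.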
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c721 = arith.constant 721 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%thread_id_x = gpu.thread_id x
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x20xf16, #gpu.address_space<workgroup>>
%subview = memref.subview %alloc[0, 0, 0, 0] [2, 1, 32, 16] [1, 1, 1, 1] : memref<2x1x32x20xf16, #gpu.address_space<workgroup>> to memref<2x1x32x16xf16, strided<[640, 640, 20, 1]>, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<16x20xf16, #gpu.address_space<workgroup>>
%subview_3 = memref.subview %alloc_2[0, 0] [16, 16] [1, 1] : memref<16x20xf16, #gpu.address_space<workgroup>> to memref<16x16xf16, strided<[20, 1]>, #gpu.address_space<workgroup>>
%alloc_4 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%4:2 = affine.delinearize_index %thread_id_x into (4, 64) : index, index
gpu.barrier
%5:2 = affine.delinearize_index %4#0 into (2, 2) : index, index
%6 = gpu.lane_id
%7 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%6)
%8 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%6)
%9 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%6, %5#1, %5#0)
%10:3 = affine.delinearize_index %9 into (2, 32, 4) : index, index, index
%11 = affine.min affine_map<(d0) -> (2, d0)>(%10#0)
%12 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%11)
%13 = arith.cmpi eq, %12, %c0 : index
%14 = affine.min affine_map<(d0) -> (17, d0)>(%10#1)
%15 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%14)
%16 = arith.cmpi eq, %15, %c0 : index
%17 = arith.ori %16, %13 : i1
%18:2 = affine.delinearize_index %9 into (16, 16) : index, index
%subview_5 = memref.subview %alloc_2[%18#0, %18#1] [1, 1] [1, 1] : memref<16x20xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
%19 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%18#1, %arg1)
%20 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%3, %19)
%21 = arith.cmpi eq, %20, %c0 : index
%expand_shape = memref.expand_shape %subview [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, strided<[640, 640, 20, 1]>, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, strided<[640, 640, 320, 20, 16, 1]>, #gpu.address_space<workgroup>>
%expand_shape_6 = memref.expand_shape %subview_3 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, strided<[20, 1]>, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, strided<[320, 20, 16, 1]>, #gpu.address_space<workgroup>>
%22 = scf.for %arg2 = %c0 to %c721 step %c1 iter_args(%arg3 = %cst_1) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%28 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg2, %10#2)
%29 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%28)
%30 = arith.cmpi eq, %29, %c0 : index
%31 = arith.ori %30, %17 : i1
%32 = scf.if %31 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%alloca = memref.alloca(%12, %15, %29) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg4 = %c0 to %12 step %c1 {
scf.for %arg5 = %c0 to %15 step %c1 {
scf.for %arg6 = %c0 to %29 step %c1 {
%52 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%11, %arg4]
%53 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg5, %arg0, %14, %28, %arg6)
%54 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg5, %arg0, %14, %28, %arg6)
%55 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%28, %arg6)
%56 = vector.transfer_read %0[%52, %53, %54, %55], %cst_0 {in_bounds = [true, true, true, true]} : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1xf16>
vector.transfer_write %56, %alloca[%arg4, %c0, %arg5, %arg6] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf16>, memref<?x1x?x?xf16, #gpu.address_space<private>>
}
}
}
%alloca_7 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_7 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_8 = memref.subview %alloca_7[0, 0, 0, 0] [%12, 1, %15, %29] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_8 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_7 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%33 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x4xf16, #gpu.address_space<private>>, vector<1x1x1x4xf16>
%34 = affine.apply affine_map<(d0) -> (d0 * 4)>(%10#2)
vector.transfer_write %33, %alloc[%10#0, %c0, %10#1, %34] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf16>, memref<2x1x32x20xf16, #gpu.address_space<workgroup>>
%35 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg2)[%18#0]
%36 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%35)
%37 = arith.cmpi eq, %36, %c0 : index
%38 = arith.ori %21, %37 : i1
scf.if %38 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_5 : memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%52 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%19]
%subview_7 = memref.subview %1[%35, %52] [%36, %20] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_5 : memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_8 = memref.subview %alloc_2[%18#0, %18#1] [%36, %20] [1, 1] : memref<16x20xf16, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_7, %subview_8 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%39 = vector.transfer_read %expand_shape[%5#0, %c0, %5#1, %7, %c0, %8], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, strided<[640, 640, 320, 20, 16, 1]>, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%40 = vector.transpose %39, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%41 = vector.transfer_read %expand_shape_6[%c0, %8, %c0, %7], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, strided<[320, 20, 16, 1]>, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%42 = vector.transpose %41, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%43 = vector.extract %40[0, 0, 0, 0] : vector<1x4xf16> from vector<1x1x1x1x1x4xf16>
%44 = vector.extract %42[0, 0] : vector<1x4xf16> from vector<1x1x1x4xf16>
%45 = vector.extract %arg3[0, 0, 0, 0] : vector<4x1xf32> from vector<1x1x1x1x4x1xf32>
%46 = vector.shape_cast %43 : vector<1x4xf16> to vector<4xf16>
%47 = vector.shape_cast %44 : vector<1x4xf16> to vector<4xf16>
%48 = vector.shape_cast %45 : vector<4x1xf32> to vector<4xf32>
%49 = amdgpu.mfma %46 * %47 + %48 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%50 = vector.shape_cast %49 : vector<4xf32> to vector<4x1xf32>
%51 = vector.broadcast %50 : vector<4x1xf32> to vector<1x1x1x1x4x1xf32>
scf.yield %51 : vector<1x1x1x1x4x1xf32>
}
%23 = vector.transpose %22, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%24 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%6)
%25 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%6)
vector.transfer_write %23, %alloc_4[%5#0, %c0, %5#1, %24, %c0, %25] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
gpu.barrier
%collapse_shape = memref.collapse_shape %alloc_4 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%26 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%3)
%27 = affine.apply affine_map<(d0) -> ((d0 ceildiv 4) * 34)>(%3)
gpu.barrier
scf.for %arg2 = %thread_id_x to %27 step %c256 {
%28:3 = affine.delinearize_index %arg2 into (2, 17, %26) : index, index, index
%29 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%28#2)[%3]
scf.for %arg3 = %c0 to %29 step %c1 {
%30 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%28#2)[%arg3]
%31 = vector.transfer_read %collapse_shape[%28#0, %c0, %28#1, %30], %cst {in_bounds = [true, true, true, true]} : memref<2x1x32x16xf32, #gpu.address_space<workgroup>>, vector<1x1x1x1xf32>
%32 = affine.apply affine_map<(d0, d1)[s0] -> (d0 * 16 + d1 * 4 + s0)>(%arg1, %28#2)[%arg3]
vector.transfer_write %31, %2[%28#0, %arg0, %28#1, %32] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf32>, memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
}
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
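// Canonicalization finds nothing further to simplify at this point; the dump below appears
// identical to the previous one.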
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c721 = arith.constant 721 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%thread_id_x = gpu.thread_id x
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x20xf16, #gpu.address_space<workgroup>>
%subview = memref.subview %alloc[0, 0, 0, 0] [2, 1, 32, 16] [1, 1, 1, 1] : memref<2x1x32x20xf16, #gpu.address_space<workgroup>> to memref<2x1x32x16xf16, strided<[640, 640, 20, 1]>, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<16x20xf16, #gpu.address_space<workgroup>>
%subview_3 = memref.subview %alloc_2[0, 0] [16, 16] [1, 1] : memref<16x20xf16, #gpu.address_space<workgroup>> to memref<16x16xf16, strided<[20, 1]>, #gpu.address_space<workgroup>>
%alloc_4 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%4:2 = affine.delinearize_index %thread_id_x into (4, 64) : index, index
gpu.barrier
%5:2 = affine.delinearize_index %4#0 into (2, 2) : index, index
%6 = gpu.lane_id
%7 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%6)
%8 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%6)
%9 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%6, %5#1, %5#0)
%10:3 = affine.delinearize_index %9 into (2, 32, 4) : index, index, index
%11 = affine.min affine_map<(d0) -> (2, d0)>(%10#0)
%12 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%11)
%13 = arith.cmpi eq, %12, %c0 : index
%14 = affine.min affine_map<(d0) -> (17, d0)>(%10#1)
%15 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%14)
%16 = arith.cmpi eq, %15, %c0 : index
%17 = arith.ori %16, %13 : i1
%18:2 = affine.delinearize_index %9 into (16, 16) : index, index
%subview_5 = memref.subview %alloc_2[%18#0, %18#1] [1, 1] [1, 1] : memref<16x20xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
%19 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%18#1, %arg1)
%20 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%3, %19)
%21 = arith.cmpi eq, %20, %c0 : index
%expand_shape = memref.expand_shape %subview [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, strided<[640, 640, 20, 1]>, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, strided<[640, 640, 320, 20, 16, 1]>, #gpu.address_space<workgroup>>
%expand_shape_6 = memref.expand_shape %subview_3 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, strided<[20, 1]>, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, strided<[320, 20, 16, 1]>, #gpu.address_space<workgroup>>
%22 = scf.for %arg2 = %c0 to %c721 step %c1 iter_args(%arg3 = %cst_1) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%28 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg2, %10#2)
%29 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%28)
%30 = arith.cmpi eq, %29, %c0 : index
%31 = arith.ori %30, %17 : i1
%32 = scf.if %31 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%alloca = memref.alloca(%12, %15, %29) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg4 = %c0 to %12 step %c1 {
scf.for %arg5 = %c0 to %15 step %c1 {
scf.for %arg6 = %c0 to %29 step %c1 {
%52 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%11, %arg4]
%53 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg5, %arg0, %14, %28, %arg6)
%54 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg5, %arg0, %14, %28, %arg6)
%55 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%28, %arg6)
%56 = vector.transfer_read %0[%52, %53, %54, %55], %cst_0 {in_bounds = [true, true, true, true]} : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1xf16>
vector.transfer_write %56, %alloca[%arg4, %c0, %arg5, %arg6] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf16>, memref<?x1x?x?xf16, #gpu.address_space<private>>
}
}
}
%alloca_7 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_7 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_8 = memref.subview %alloca_7[0, 0, 0, 0] [%12, 1, %15, %29] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_8 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_7 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%33 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x4xf16, #gpu.address_space<private>>, vector<1x1x1x4xf16>
%34 = affine.apply affine_map<(d0) -> (d0 * 4)>(%10#2)
vector.transfer_write %33, %alloc[%10#0, %c0, %10#1, %34] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf16>, memref<2x1x32x20xf16, #gpu.address_space<workgroup>>
%35 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg2)[%18#0]
%36 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%35)
%37 = arith.cmpi eq, %36, %c0 : index
%38 = arith.ori %21, %37 : i1
scf.if %38 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_5 : memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%52 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%19]
%subview_7 = memref.subview %1[%35, %52] [%36, %20] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_5 : memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_8 = memref.subview %alloc_2[%18#0, %18#1] [%36, %20] [1, 1] : memref<16x20xf16, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_7, %subview_8 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%39 = vector.transfer_read %expand_shape[%5#0, %c0, %5#1, %7, %c0, %8], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, strided<[640, 640, 320, 20, 16, 1]>, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%40 = vector.transpose %39, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%41 = vector.transfer_read %expand_shape_6[%c0, %8, %c0, %7], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, strided<[320, 20, 16, 1]>, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%42 = vector.transpose %41, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%43 = vector.extract %40[0, 0, 0, 0] : vector<1x4xf16> from vector<1x1x1x1x1x4xf16>
%44 = vector.extract %42[0, 0] : vector<1x4xf16> from vector<1x1x1x4xf16>
%45 = vector.extract %arg3[0, 0, 0, 0] : vector<4x1xf32> from vector<1x1x1x1x4x1xf32>
%46 = vector.shape_cast %43 : vector<1x4xf16> to vector<4xf16>
%47 = vector.shape_cast %44 : vector<1x4xf16> to vector<4xf16>
%48 = vector.shape_cast %45 : vector<4x1xf32> to vector<4xf32>
%49 = amdgpu.mfma %46 * %47 + %48 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%50 = vector.shape_cast %49 : vector<4xf32> to vector<4x1xf32>
%51 = vector.broadcast %50 : vector<4x1xf32> to vector<1x1x1x1x4x1xf32>
scf.yield %51 : vector<1x1x1x1x4x1xf32>
}
%23 = vector.transpose %22, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%24 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%6)
%25 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%6)
vector.transfer_write %23, %alloc_4[%5#0, %c0, %5#1, %24, %c0, %25] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
gpu.barrier
%collapse_shape = memref.collapse_shape %alloc_4 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%26 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%3)
%27 = affine.apply affine_map<(d0) -> ((d0 ceildiv 4) * 34)>(%3)
gpu.barrier
scf.for %arg2 = %thread_id_x to %27 step %c256 {
%28:3 = affine.delinearize_index %arg2 into (2, 17, %26) : index, index, index
%29 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%28#2)[%3]
scf.for %arg3 = %c0 to %29 step %c1 {
%30 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%28#2)[%arg3]
%31 = vector.transfer_read %collapse_shape[%28#0, %c0, %28#1, %30], %cst {in_bounds = [true, true, true, true]} : memref<2x1x32x16xf32, #gpu.address_space<workgroup>>, vector<1x1x1x1xf32>
%32 = affine.apply affine_map<(d0, d1)[s0] -> (d0 * 16 + d1 * 4 + s0)>(%arg1, %28#2)[%arg3]
vector.transfer_write %31, %2[%28#0, %arg0, %28#1, %32] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf32>, memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
}
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
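// CSE deduplicates identical pure ops. Relative to the previous dump, the SSA numbering inside
// the K loop shifts from %28 to %26, consistent with the two duplicate lane-index affine.apply
// ops after the loop (recomputing the "mod 16" and "floordiv 16" values already available as
// %7 and %8) being eliminated.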
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c721 = arith.constant 721 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%thread_id_x = gpu.thread_id x
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x20xf16, #gpu.address_space<workgroup>>
%subview = memref.subview %alloc[0, 0, 0, 0] [2, 1, 32, 16] [1, 1, 1, 1] : memref<2x1x32x20xf16, #gpu.address_space<workgroup>> to memref<2x1x32x16xf16, strided<[640, 640, 20, 1]>, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<16x20xf16, #gpu.address_space<workgroup>>
%subview_3 = memref.subview %alloc_2[0, 0] [16, 16] [1, 1] : memref<16x20xf16, #gpu.address_space<workgroup>> to memref<16x16xf16, strided<[20, 1]>, #gpu.address_space<workgroup>>
%alloc_4 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%4:2 = affine.delinearize_index %thread_id_x into (4, 64) : index, index
gpu.barrier
%5:2 = affine.delinearize_index %4#0 into (2, 2) : index, index
%6 = gpu.lane_id
%7 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%6)
%8 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%6)
%9 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%6, %5#1, %5#0)
%10:3 = affine.delinearize_index %9 into (2, 32, 4) : index, index, index
%11 = affine.min affine_map<(d0) -> (2, d0)>(%10#0)
%12 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%11)
%13 = arith.cmpi eq, %12, %c0 : index
%14 = affine.min affine_map<(d0) -> (17, d0)>(%10#1)
%15 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%14)
%16 = arith.cmpi eq, %15, %c0 : index
%17 = arith.ori %16, %13 : i1
%18:2 = affine.delinearize_index %9 into (16, 16) : index, index
%subview_5 = memref.subview %alloc_2[%18#0, %18#1] [1, 1] [1, 1] : memref<16x20xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
%19 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%18#1, %arg1)
%20 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%3, %19)
%21 = arith.cmpi eq, %20, %c0 : index
%expand_shape = memref.expand_shape %subview [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, strided<[640, 640, 20, 1]>, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, strided<[640, 640, 320, 20, 16, 1]>, #gpu.address_space<workgroup>>
%expand_shape_6 = memref.expand_shape %subview_3 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, strided<[20, 1]>, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, strided<[320, 20, 16, 1]>, #gpu.address_space<workgroup>>
%22 = scf.for %arg2 = %c0 to %c721 step %c1 iter_args(%arg3 = %cst_1) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%26 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg2, %10#2)
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %17 : i1
%30 = scf.if %29 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%alloca = memref.alloca(%12, %15, %27) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg4 = %c0 to %12 step %c1 {
scf.for %arg5 = %c0 to %15 step %c1 {
scf.for %arg6 = %c0 to %27 step %c1 {
%50 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%11, %arg4]
%51 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg5, %arg0, %14, %26, %arg6)
%52 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg5, %arg0, %14, %26, %arg6)
%53 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%26, %arg6)
%54 = vector.transfer_read %0[%50, %51, %52, %53], %cst_0 {in_bounds = [true, true, true, true]} : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1xf16>
vector.transfer_write %54, %alloca[%arg4, %c0, %arg5, %arg6] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf16>, memref<?x1x?x?xf16, #gpu.address_space<private>>
}
}
}
%alloca_7 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_7 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_8 = memref.subview %alloca_7[0, 0, 0, 0] [%12, 1, %15, %27] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_8 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_7 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%31 = vector.transfer_read %30[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x4xf16, #gpu.address_space<private>>, vector<1x1x1x4xf16>
%32 = affine.apply affine_map<(d0) -> (d0 * 4)>(%10#2)
vector.transfer_write %31, %alloc[%10#0, %c0, %10#1, %32] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf16>, memref<2x1x32x20xf16, #gpu.address_space<workgroup>>
%33 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg2)[%18#0]
%34 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%33)
%35 = arith.cmpi eq, %34, %c0 : index
%36 = arith.ori %21, %35 : i1
scf.if %36 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_5 : memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.addre
// (The remainder of this IR dump is truncated: the full file is too large to display on GitHub.)