@banach-space
Created March 14, 2023 16:43
Generated using the pad_fusion branch of IREE
This file has been truncated.
// -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) //----- //
hal.executable.variant public @embedded_elf_arm_64, target = <"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "+reserve-x18", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-unknown-unknown-eabi-elf"}> {
hal.executable.export public @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index):
%c30 = arith.constant 30 : index
%c18 = arith.constant 18 : index
%c1 = arith.constant 1 : index
hal.return %c30, %c18, %c1 : index, index, index
}
builtin.module {
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
scf.for %arg0 = %2 to %c1080 step %3 {
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%7 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%8 = affine.min affine_map<(d0) -> (1920, d0)>(%7)
%9 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%10 = affine.min affine_map<(d0) -> (1920, d0)>(%9)
%11 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%10, %8)
%12 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 106)>(%6, %10, %8)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %8, 0], sizes = [1, %c60, %11, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x?x?x1xi32>
%padded = tensor.pad %13 low[0, 0, %6, 0] high[0, 0, %12, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index, %arg5: index):
tensor.yield %c0_i32 : i32
} : tensor<1x?x?x1xi32> to tensor<1x?x?x1xi32>
%14 = tensor.empty() : tensor<1x60x64x1xi32>
%cast = tensor.cast %14 : tensor<1x60x64x1xi32> to tensor<1x?x?x1xi32>
%15 = linalg.fill ins(%c0_i32 : i32) outs(%cast : tensor<1x?x?x1xi32>) -> tensor<1x?x?x1xi32>
%16 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x?x?x1xi32>, tensor<1x43x1xi32>) outs(%15 : tensor<1x?x?x1xi32>) -> tensor<1x?x?x1xi32>
flow.dispatch.tensor.store %16, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, %c60, %c64, 1], strides = [1, 1, 1, 1] : tensor<1x?x?x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
}
}
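// Note: workgroup-level tiling and distribution is now in place. The
// 1x1080x1920x1 output is split into 60x64 tiles, giving the 30x18x1 workgroup
// count returned above (1920/64 = 30 along x, 1080/60 = 18 along y). Since the
// 1x43 filter needs 43 - 1 = 42 extra input columns per tile, each iteration
// loads a clamped input slice and tensor.pad widens it to the full
// 64 + 43 - 1 = 106-column window: the affine.max/affine.min chain computes the
// low pad max(21 - x, 0), the clamped source extent, and a high pad chosen so
// that low + size + high = 106.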
// -----// IR Dump After ConvertToDestinationPassingStyle (iree-codegen-convert-to-destination-passing-style) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
scf.for %arg0 = %2 to %c1080 step %3 {
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, %c60, %c64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x?x?x1xi32>
%cast = tensor.cast %6 : tensor<1x?x?x1xi32> to tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 106)>(%7, %11, %9)
%14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, %c60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x?x?x1xi32>
%padded = tensor.pad %14 low[0, 0, %7, 0] high[0, 0, %13, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index, %arg5: index):
tensor.yield %c0_i32 : i32
} : tensor<1x?x?x1xi32> to tensor<1x?x?x1xi32>
%cast_0 = tensor.cast %cast : tensor<1x60x64x1xi32> to tensor<1x?x?x1xi32>
%15 = linalg.fill ins(%c0_i32 : i32) outs(%cast_0 : tensor<1x?x?x1xi32>) -> tensor<1x?x?x1xi32>
%16 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x?x?x1xi32>, tensor<1x43x1xi32>) outs(%15 : tensor<1x?x?x1xi32>) -> tensor<1x?x?x1xi32>
flow.dispatch.tensor.store %16, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, %c60, %c64, 1], strides = [1, 1, 1, 1] : tensor<1x?x?x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
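// Note: destination-passing style replaces the tensor.empty init with a load of
// the matching output tile from binding 1, so linalg.fill and the depthwise
// conv now operate on the slice that the final flow.dispatch.tensor.store
// writes back. The load from the writeonly tensor appears to act purely as a
// destination placeholder; every element is overwritten by the fill.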
// -----// IR Dump After TileAndDecomposeAttention (iree-linalg-ext-tile-and-decompose-attention) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
scf.for %arg0 = %2 to %c1080 step %3 {
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, %c60, %c64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x?x?x1xi32>
%cast = tensor.cast %6 : tensor<1x?x?x1xi32> to tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 106)>(%7, %11, %9)
%14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, %c60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x?x?x1xi32>
%padded = tensor.pad %14 low[0, 0, %7, 0] high[0, 0, %13, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index, %arg5: index):
tensor.yield %c0_i32 : i32
} : tensor<1x?x?x1xi32> to tensor<1x?x?x1xi32>
%cast_0 = tensor.cast %cast : tensor<1x60x64x1xi32> to tensor<1x?x?x1xi32>
%15 = linalg.fill ins(%c0_i32 : i32) outs(%cast_0 : tensor<1x?x?x1xi32>) -> tensor<1x?x?x1xi32>
%16 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x?x?x1xi32>, tensor<1x43x1xi32>) outs(%15 : tensor<1x?x?x1xi32>) -> tensor<1x?x?x1xi32>
flow.dispatch.tensor.store %16, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, %c60, %c64, 1], strides = [1, 1, 1, 1] : tensor<1x?x?x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
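// Note: this dispatch contains no attention ops, so the pass leaves the IR
// unchanged from the previous dump.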
// -----// IR Dump After FoldAffineMinInDistributedLoops (iree-codegen-fold-affinemin-in-distributed-loops) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
scf.for %arg0 = %2 to %c1080 step %3 {
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, %c60, %c64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x?x?x1xi32>
%cast = tensor.cast %6 : tensor<1x?x?x1xi32> to tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 106)>(%7, %11, %9)
%14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, %c60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x?x?x1xi32>
%padded = tensor.pad %14 low[0, 0, %7, 0] high[0, 0, %13, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index, %arg5: index):
tensor.yield %c0_i32 : i32
} : tensor<1x?x?x1xi32> to tensor<1x?x?x1xi32>
%cast_0 = tensor.cast %cast : tensor<1x60x64x1xi32> to tensor<1x?x?x1xi32>
%15 = linalg.fill ins(%c0_i32 : i32) outs(%cast_0 : tensor<1x?x?x1xi32>) -> tensor<1x?x?x1xi32>
%16 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x?x?x1xi32>, tensor<1x43x1xi32>) outs(%15 : tensor<1x?x?x1xi32>) -> tensor<1x?x?x1xi32>
flow.dispatch.tensor.store %16, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, %c60, %c64, 1], strides = [1, 1, 1, 1] : tensor<1x?x?x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
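// Note: identical to the previous dump; none of the affine.min ops here seem
// to match the distributed-loop pattern this pass folds.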
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
scf.for %arg0 = %2 to %c1080 step %3 {
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 106)>(%7, %11, %9)
%14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%padded = tensor.pad %14 low[0, 0, %7, 0] high[0, 0, %13, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index, %arg5: index):
tensor.yield %c0_i32 : i32
} : tensor<1x60x?x1xi32> to tensor<1x60x?x1xi32>
%15 = linalg.fill ins(%c0_i32 : i32) outs(%6 : tensor<1x60x64x1xi32>) -> tensor<1x60x64x1xi32>
%16 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x60x?x1xi32>, tensor<1x43x1xi32>) outs(%15 : tensor<1x60x64x1xi32>) -> tensor<1x60x64x1xi32>
flow.dispatch.tensor.store %16, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
}
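// Note: canonicalization folds away the tensor.cast chain. The output tile is
// now the static tensor<1x60x64x1xi32>, only the padded input slice keeps a
// dynamic width (tensor<1x60x?x1xi32>), and the now-unused %c60/%c64 constants
// are dropped.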
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
scf.for %arg0 = %2 to %c1080 step %3 {
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 106)>(%7, %11, %9)
%14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%padded = tensor.pad %14 low[0, 0, %7, 0] high[0, 0, %13, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index, %arg5: index):
tensor.yield %c0_i32 : i32
} : tensor<1x60x?x1xi32> to tensor<1x60x?x1xi32>
%15 = linalg.fill ins(%c0_i32 : i32) outs(%6 : tensor<1x60x64x1xi32>) -> tensor<1x60x64x1xi32>
%16 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x60x?x1xi32>, tensor<1x43x1xi32>) outs(%15 : tensor<1x60x64x1xi32>) -> tensor<1x60x64x1xi32>
flow.dispatch.tensor.store %16, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
}
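// Note: no duplicated subexpressions were left after canonicalization, so CSE
// is a no-op here.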
// -----// IR Dump After TileAndDecomposeWinogradTransform (iree-linalg-ext-tile-and-decompose-winograd) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
scf.for %arg0 = %2 to %c1080 step %3 {
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 106)>(%7, %11, %9)
%14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%padded = tensor.pad %14 low[0, 0, %7, 0] high[0, 0, %13, 0] {
^bb0(%arg2: index, %arg3: index, %arg4: index, %arg5: index):
tensor.yield %c0_i32 : i32
} : tensor<1x60x?x1xi32> to tensor<1x60x?x1xi32>
%15 = linalg.fill ins(%c0_i32 : i32) outs(%6 : tensor<1x60x64x1xi32>) -> tensor<1x60x64x1xi32>
%16 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x60x?x1xi32>, tensor<1x43x1xi32>) outs(%15 : tensor<1x60x64x1xi32>) -> tensor<1x60x64x1xi32>
flow.dispatch.tensor.store %16, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
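// Note: there are no Winograd transform ops in this dispatch, so the pass
// makes no changes.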
// -----// IR Dump After LinalgStrategyTileAndFusePass (iree-linalg-strategy-tile-and-fuse-pass) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
scf.for %arg0 = %2 to %c1080 step %3 {
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c1 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c60 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = scf.for %arg6 = %c0 to %c64 step %c32 iter_args(%arg7 = %arg5) -> (tensor<1x60x64x1xi32>) {
%17 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x60x64x1xi32>) {
%18 = affine.min affine_map<(d0) -> (d0, 1)>(%arg2)
%19 = affine.min affine_map<(d0) -> (d0 + 1, 1)>(%arg2)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%19, %18)
%21 = affine.min affine_map<(d0) -> (d0, 60)>(%arg4)
%22 = affine.min affine_map<(d0) -> (d0 + 1, 60)>(%arg4)
%23 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%22, %21)
%dim = tensor.dim %13, %c2 : tensor<1x60x?x1xi32>
%24 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg6)
%25 = affine.max affine_map<(d0, d1) -> (d0 - d1, 0)>(%arg6, %7)
%26 = affine.min affine_map<(d0, d1) -> (d0, d1)>(%25, %dim)
%27 = affine.max affine_map<(d0, d1) -> (d0 - d1 + 74, 0)>(%arg6, %7)
%28 = affine.min affine_map<(d0, d1) -> (d0, d1)>(%27, %dim)
%29 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%28, %26)
%30 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%24, %28, %26)
%31 = affine.min affine_map<(d0) -> (d0, 1)>(%arg8)
%32 = affine.min affine_map<(d0) -> (d0 + 1, 1)>(%arg8)
%33 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%32, %31)
%extracted_slice = tensor.extract_slice %13[%18, %21, %26, %31] [%20, %23, %29, %33] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<?x?x?x?xi32>
%padded = tensor.pad %extracted_slice low[0, 0, %24, 0] high[0, 0, %30, 0] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %c0_i32 : i32
} {__internal_linalg_transform__ = "1"} : tensor<?x?x?x?xi32> to tensor<?x?x?x?xi32>
%cast = tensor.cast %padded : tensor<?x?x?x?xi32> to tensor<1x1x74x1xi32>
%extracted_slice_0 = tensor.extract_slice %cst[0, 0, %arg8] [1, 43, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x43x1xi32>
%extracted_slice_1 = tensor.extract_slice %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%34 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%35 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "1", dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%cast, %extracted_slice_0 : tensor<1x1x74x1xi32>, tensor<1x43x1xi32>) outs(%34 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%inserted_slice = tensor.insert_slice %35 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice : tensor<1x60x64x1xi32>
}
scf.yield %17 : tensor<1x60x64x1xi32>
}
scf.yield %16 : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
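// Note: the second tiling level [1, 1, 32, 1] from the lowering_config is
// applied here: four nested scf.for loops (two of them unit-trip) thread the
// 1x60x64x1 accumulator through iter_args, and each step extracts a 1x1x32x1
// output slice plus the matching input slice, which tensor.pad widens to
// 32 + 43 - 1 = 74 columns. The __internal_linalg_transform__ = "1" attributes
// are the strategy's internal filter markers; they are stripped again by
// LinalgStrategyRemoveMarkersPass further down.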
// -----// IR Dump After CSE (cse) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%24 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "1", dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x1x74x1xi32>, tensor<1x43x1xi32>) outs(%23 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
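// Note: relative to the previous dump, the two unit-trip loops are folded
// away, the tensor.dim and per-slice casts are gone, the affine maps are
// composed into simpler forms, and the loop-invariant workgroup offsets %4/%5
// are hoisted above the scf.for nest. This cleanup presumably comes from
// canonicalization patterns run inside the strategy pipeline before this CSE
// dump was taken.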
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%24 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "1", dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x1x74x1xi32>, tensor<1x43x1xi32>) outs(%23 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
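// Note: unchanged from the previous dump; no structural rewrites are visible
// from this enable pass.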
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (iree-linalg-strategy-remove-markers-pass) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%24 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x1x74x1xi32>, tensor<1x43x1xi32>) outs(%23 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
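// Note: the __internal_linalg_transform__ = "1" markers on linalg.fill and the
// depthwise conv are removed; the IR is otherwise identical.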
// -----// IR Dump After CSE (cse) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%24 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x1x74x1xi32>, tensor<1x43x1xi32>) outs(%23 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
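// Note: a no-op; the IR already contains no duplicated subexpressions.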
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%24 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x1x74x1xi32>, tensor<1x43x1xi32>) outs(%23 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
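// Note: again unchanged from the previous dump.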
// -----// IR Dump After LinalgFuse (linalg-fuse) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%24 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x1x74x1xi32>, tensor<1x43x1xi32>) outs(%23 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
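// Note: this dump is line-for-line identical to the LinalgFuse output above; canonicalization
// had nothing further to fold at this point.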
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%24 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x1x74x1xi32>, tensor<1x43x1xi32>) outs(%23 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After CSE (cse) //----- //
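// Note: again unchanged; there were no common subexpressions left to eliminate.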
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%24 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x1x74x1xi32>, tensor<1x43x1xi32>) outs(%23 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After LinalgStrategyTilePass (iree-linalg-strategy-tile-pass) //----- //
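// Note: the third lowering_config level [0, 0, 0, 0, 1, 1] tiles the two reduction (filter)
// dimensions: a new %c43 appears, and two scf.for loops over filter height (trip count 1) and
// filter width (trip count 43) extract a 1x1x32x1 input window and a 1x1x1 filter tap per
// iteration. The __internal_linalg_transform__ = "1" markers appear to tag the ops that the
// strategy passes below still need to process.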
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%24 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %23) -> (tensor<1x1x32x1xi32>) {
%25 = scf.for %arg8 = %c0 to %c43 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x32x1xi32>) {
%extracted_slice_1 = tensor.extract_slice %padded[0, %arg6, %arg8, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x74x1xi32> to tensor<1x1x32x1xi32>
%extracted_slice_2 = tensor.extract_slice %cst[%arg6, %arg8, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32>
%26 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "1", dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x1x32x1xi32>, tensor<1x1x1xi32>) outs(%arg9 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
scf.yield %26 : tensor<1x1x32x1xi32>
}
scf.yield %25 : tensor<1x1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After CSE (cse) //----- //
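// Note: the unit-trip-count loop over filter height has been folded away as part of this
// cleanup; the remaining loop over %c43 now indexes %padded and %cst directly.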
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %23) -> (tensor<1x1x32x1xi32>) {
%extracted_slice_1 = tensor.extract_slice %padded[0, 0, %arg6, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x74x1xi32> to tensor<1x1x32x1xi32>
%extracted_slice_2 = tensor.extract_slice %cst[0, %arg6, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32>
%25 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "1", dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x1x32x1xi32>, tensor<1x1x1xi32>) outs(%arg7 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
scf.yield %25 : tensor<1x1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- //
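// Note: no visible change in this dump.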
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %23) -> (tensor<1x1x32x1xi32>) {
%extracted_slice_1 = tensor.extract_slice %padded[0, 0, %arg6, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x74x1xi32> to tensor<1x1x32x1xi32>
%extracted_slice_2 = tensor.extract_slice %cst[0, %arg6, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32>
%25 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "1", dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x1x32x1xi32>, tensor<1x1x1xi32>) outs(%arg7 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
scf.yield %25 : tensor<1x1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After LinalgStrategyDecomposePass (iree-linalg-strategy-decompose-pass) //----- //
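// Note: with the filter height tiled to 1, the conv is decomposed: rank-reducing
// tensor.extract_slice ops drop the unit H dimensions (tensor<1x1x32x1xi32> -> tensor<1x32x1xi32>,
// tensor<1x1x1xi32> -> tensor<1x1xi32>) and linalg.depthwise_conv_2d_nhwc_hwc becomes
// linalg.depthwise_conv_1d_nwc_wc; the 1-D result is inserted back into the 4-D accumulator tile.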
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %23) -> (tensor<1x1x32x1xi32>) {
%extracted_slice_1 = tensor.extract_slice %padded[0, 0, %arg6, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x74x1xi32> to tensor<1x1x32x1xi32>
%extracted_slice_2 = tensor.extract_slice %cst[0, %arg6, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32>
%extracted_slice_3 = tensor.extract_slice %extracted_slice_1[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0] [1, 1, 1] [1, 1, 1] : tensor<1x1x1xi32> to tensor<1x1xi32>
%extracted_slice_5 = tensor.extract_slice %arg7[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%25 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<1x32x1xi32>, tensor<1x1xi32>) outs(%extracted_slice_5 : tensor<1x32x1xi32>) -> tensor<1x32x1xi32>
%inserted_slice_6 = tensor.insert_slice %25 into %arg7[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
scf.yield %inserted_slice_6 : tensor<1x1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After CSE (cse) //----- //
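// Note: the rank-reducing slice of the accumulator has been hoisted out of the filter loop: the
// loop now carries both the 4-D tile and its 3-D view (%24:2), and the insert_slice back into
// the 4-D tile happens once, after the loop. The 4-D iter_arg is yielded unchanged, leaving it
// dead.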
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%extracted_slice_1 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%24:2 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %23, %arg8 = %extracted_slice_1) -> (tensor<1x1x32x1xi32>, tensor<1x32x1xi32>) {
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, %arg6, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x74x1xi32> to tensor<1x1x32x1xi32>
%extracted_slice_4 = tensor.extract_slice %cst[0, %arg6, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0] [1, 1, 1] [1, 1, 1] : tensor<1x1x1xi32> to tensor<1x1xi32>
%25 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x32x1xi32>, tensor<1x1xi32>) outs(%arg8 : tensor<1x32x1xi32>) -> tensor<1x32x1xi32>
scf.yield %arg7, %25 : tensor<1x1x32x1xi32>, tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24#1 into %24#0[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_2 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_2 : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- //
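// Note: unchanged from the previous dump.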
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%extracted_slice_1 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%24:2 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %23, %arg8 = %extracted_slice_1) -> (tensor<1x1x32x1xi32>, tensor<1x32x1xi32>) {
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, %arg6, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x74x1xi32> to tensor<1x1x32x1xi32>
%extracted_slice_4 = tensor.extract_slice %cst[0, %arg6, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0] [1, 1, 1] [1, 1, 1] : tensor<1x1x1xi32> to tensor<1x1xi32>
%25 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x32x1xi32>, tensor<1x1xi32>) outs(%arg8 : tensor<1x32x1xi32>) -> tensor<1x32x1xi32>
scf.yield %arg7, %25 : tensor<1x1x32x1xi32>, tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24#1 into %24#0[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_2 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_2 : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (iree-linalg-strategy-remove-markers-pass) //----- //
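// Note: the only change here is that the __internal_linalg_transform__ = "1" marker has been
// stripped from linalg.fill (the decomposed conv had already lost its marker).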
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%extracted_slice_1 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%24:2 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %23, %arg8 = %extracted_slice_1) -> (tensor<1x1x32x1xi32>, tensor<1x32x1xi32>) {
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, %arg6, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x74x1xi32> to tensor<1x1x32x1xi32>
%extracted_slice_4 = tensor.extract_slice %cst[0, %arg6, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0] [1, 1, 1] [1, 1, 1] : tensor<1x1x1xi32> to tensor<1x1xi32>
%25 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x32x1xi32>, tensor<1x1xi32>) outs(%arg8 : tensor<1x32x1xi32>) -> tensor<1x32x1xi32>
scf.yield %arg7, %25 : tensor<1x1x32x1xi32>, tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24#1 into %24#0[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_2 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_2 : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After CSE (cse) //----- //
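// Note: the dead 4-D iter_arg is gone: the filter loop now carries only the tensor<1x32x1xi32>
// accumulator, and the final insert_slice targets %23 directly.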
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%extracted_slice_1 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_1) -> (tensor<1x32x1xi32>) {
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, %arg6, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x74x1xi32> to tensor<1x1x32x1xi32>
%extracted_slice_4 = tensor.extract_slice %cst[0, %arg6, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0] [1, 1, 1] [1, 1, 1] : tensor<1x1x1xi32> to tensor<1x1xi32>
%25 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x32x1xi32>, tensor<1x1xi32>) outs(%arg7 : tensor<1x32x1xi32>) -> tensor<1x32x1xi32>
scf.yield %25 : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_2 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_2 : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- //
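// Note: identical to the preceding CSE output.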
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%extracted_slice_1 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_1) -> (tensor<1x32x1xi32>) {
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, %arg6, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x74x1xi32> to tensor<1x1x32x1xi32>
%extracted_slice_4 = tensor.extract_slice %cst[0, %arg6, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0] [1, 1, 1] [1, 1, 1] : tensor<1x1x1xi32> to tensor<1x1xi32>
%25 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x32x1xi32>, tensor<1x1xi32>) outs(%arg7 : tensor<1x32x1xi32>) -> tensor<1x32x1xi32>
scf.yield %25 : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_2 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_2 : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After LinalgSingleTilingExpert (linalg-single-tiling-expert-driver) //----- //
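// This dump is textually identical to the previous one: the workgroup tiles, the
// 1x32 inner tiles, and the filter loop over %c43 were already materialized, so the
// single-tiling expert driver has nothing left to do at this stage.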
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%extracted_slice_1 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_1) -> (tensor<1x32x1xi32>) {
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, %arg6, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x74x1xi32> to tensor<1x1x32x1xi32>
%extracted_slice_4 = tensor.extract_slice %cst[0, %arg6, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0] [1, 1, 1] [1, 1, 1] : tensor<1x1x1xi32> to tensor<1x1xi32>
%25 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x32x1xi32>, tensor<1x1xi32>) outs(%arg7 : tensor<1x32x1xi32>) -> tensor<1x32x1xi32>
scf.yield %25 : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_2 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_2 : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
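// No change versus the previous dump; the affine.min/affine.max chains and the
// slice ops are already in canonical form.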
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%extracted_slice_1 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_1) -> (tensor<1x32x1xi32>) {
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, %arg6, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x74x1xi32> to tensor<1x1x32x1xi32>
%extracted_slice_4 = tensor.extract_slice %cst[0, %arg6, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0] [1, 1, 1] [1, 1, 1] : tensor<1x1x1xi32> to tensor<1x1xi32>
%25 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x32x1xi32>, tensor<1x1xi32>) outs(%arg7 : tensor<1x32x1xi32>) -> tensor<1x32x1xi32>
scf.yield %25 : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_2 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_2 : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After CSE (cse) //----- //
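// Again unchanged: no redundant subexpressions are left to eliminate at this point.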
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%extracted_slice_1 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_1) -> (tensor<1x32x1xi32>) {
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, %arg6, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x74x1xi32> to tensor<1x1x32x1xi32>
%extracted_slice_4 = tensor.extract_slice %cst[0, %arg6, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0] [1, 1, 1] [1, 1, 1] : tensor<1x1x1xi32> to tensor<1x1xi32>
%25 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x32x1xi32>, tensor<1x1xi32>) outs(%arg7 : tensor<1x32x1xi32>) -> tensor<1x32x1xi32>
scf.yield %25 : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_2 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_2 : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After FuseTensorPadWithConsumer (iree-codegen-fuse-tensor-pad-with-consumer) //----- //
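// First visible change in a while: the tensor.pad is fused into its consumer. It
// moves from the image-width tile loop into the innermost filter loop (step over
// %c43), where per-iteration low/high padding amounts (%24, %30) and slice bounds
// are recomputed via affine.max/affine.min, and a tensor.cast recovers the static
// tensor<1x1x32x1xi32> shape expected by the depthwise conv.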
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%22 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%extracted_slice_1 = tensor.extract_slice %22[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%23 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_1) -> (tensor<1x32x1xi32>) {
%24 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%16, %arg6)
%25 = affine.max affine_map<(d0, d1) -> (d0 - d1, 0)>(%arg6, %16)
%26 = affine.min affine_map<(d0, d1, d2) -> (d0, d1 - d2)>(%25, %20, %18)
%27 = affine.max affine_map<(d0, d1) -> (d0 - d1 + 32, 0)>(%arg6, %16)
%28 = affine.min affine_map<(d0, d1, d2) -> (d0, d1 - d2)>(%27, %20, %18)
%29 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%28, %26)
%30 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 32)>(%24, %28, %26)
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, %26, 0] [1, 1, %29, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice_3 low[0, 0, %24, 0] high[0, 0, %30, 0] {
^bb0(%arg8: index, %arg9: index, %arg10: index, %arg11: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
%cast = tensor.cast %padded : tensor<1x1x?x1xi32> to tensor<1x1x32x1xi32>
%extracted_slice_4 = tensor.extract_slice %cst[0, %arg6, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32>
%extracted_slice_5 = tensor.extract_slice %cast[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0] [1, 1, 1] [1, 1, 1] : tensor<1x1x1xi32> to tensor<1x1xi32>
%31 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x32x1xi32>, tensor<1x1xi32>) outs(%arg7 : tensor<1x32x1xi32>) -> tensor<1x32x1xi32>
scf.yield %31 : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %23 into %22[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_2 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_2 : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After ConcretizePadResultShape (iree-codegen-concretize-pad-result-shape) //----- //
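// The fused tensor.pad now yields the static type tensor<1x1x32x1xi32> directly
// (low %24 + slice size %29 + high %30 always sum to 32), so the tensor.cast from
// the previous dump is folded away and the affine maps feeding the pad are lightly
// re-canonicalized.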
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%22 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%extracted_slice_1 = tensor.extract_slice %22[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%23 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_1) -> (tensor<1x32x1xi32>) {
%24 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%16, %arg6)
%25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %16)
%26 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%25, %20, %18)
%27 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %16)
%28 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%27, %20, %18)
%29 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%28, %26)
%30 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 32)>(%24, %28, %26)
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, %26, 0] [1, 1, %29, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice_3 low[0, 0, %24, 0] high[0, 0, %30, 0] {
^bb0(%arg8: index, %arg9: index, %arg10: index, %arg11: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x32x1xi32>
%extracted_slice_4 = tensor.extract_slice %cst[0, %arg6, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32>
%extracted_slice_5 = tensor.extract_slice %padded[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0] [1, 1, 1] [1, 1, 1] : tensor<1x1x1xi32> to tensor<1x1xi32>
%31 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x32x1xi32>, tensor<1x1xi32>) outs(%arg7 : tensor<1x32x1xi32>) -> tensor<1x32x1xi32>
scf.yield %31 : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %23 into %22[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_2 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_2 : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After LinalgStrategyVectorizePass (iree-linalg-strategy-vectorize-pass) //----- //
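// Vectorization: the linalg.fill becomes a vector.transfer_write of a dense<0>
// vector<1x1x32x1xi32>; the pad is rewritten as a transfer_read (with %c0_i32 as
// the padding value and in_bounds = false on the dynamic dimension) into a
// zero-initialized tensor.empty; and the 1-D depthwise conv is lowered to
// transfer_reads, a broadcast of the scalar filter tap (vector.extract +
// vector.shuffle + vector.broadcast), arith.muli/arith.addi, and a transfer_write,
// with collapse_shape/expand_shape squeezing out the unit channel dimension.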
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%cst = arith.constant dense<0> : vector<1x1x32x1xi32>
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%extracted_slice_1 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%22 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_2 = tensor.extract_slice %22[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%23 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_2) -> (tensor<1x32x1xi32>) {
%24 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%16, %arg6)
%25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %16)
%26 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%25, %20, %18)
%27 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %16)
%28 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%27, %20, %18)
%29 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%28, %26)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, 0, %26, 0] [1, 1, %29, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
%30 = tensor.empty() : tensor<1x1x32x1xi32>
%31 = vector.transfer_write %cst, %30[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%32 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, false, true]} : tensor<1x1x?x1xi32>, vector<1x1x32x1xi32>
%33 = vector.transfer_write %32, %31[%c0, %c0, %24, %c0] {in_bounds = [true, true, false, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_5 = tensor.extract_slice %33[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%collapsed = tensor.collapse_shape %extracted_slice_5 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%collapsed_6 = tensor.collapse_shape %arg7 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%34 = vector.transfer_read %collapsed[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
%35 = vector.transfer_read %cst_0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x43x1xi32>, vector<1x1xi32>
%36 = vector.transfer_read %collapsed_6[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
%37 = vector.extract %35[0] : vector<1x1xi32>
%38 = vector.shuffle %37, %37 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%39 = vector.broadcast %38 : vector<32xi32> to vector<1x32xi32>
%40 = arith.muli %34, %39 : vector<1x32xi32>
%41 = arith.addi %40, %36 : vector<1x32xi32>
%42 = vector.transfer_write %41, %collapsed_6[%c0, %c0] {in_bounds = [true, true]} : vector<1x32xi32>, tensor<1x32xi32>
%expanded = tensor.expand_shape %42 [[0], [1, 2]] : tensor<1x32xi32> into tensor<1x32x1xi32>
scf.yield %expanded : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %23 into %22[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_3 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_3 : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After CSE (cse) //----- //
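// Observable diff: the tensor.empty() that backs the padded tile now sits above the
// loop nest (%6), effectively loop-invariant code motion, and values are renumbered;
// the computation is otherwise unchanged.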
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%cst = arith.constant dense<0> : vector<1x1x32x1xi32>
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
%6 = tensor.empty() : tensor<1x1x32x1xi32>
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%7 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%8 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%9 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%10 = affine.min affine_map<(d0) -> (1920, d0)>(%9)
%11 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%12 = affine.min affine_map<(d0) -> (1920, d0)>(%11)
%13 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%12, %10)
%14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %10, 0], sizes = [1, 60, %13, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%15 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %7) -> (tensor<1x60x64x1xi32>) {
%16 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%8, %arg4)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %8)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %12, %10)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %8)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %12, %10)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%extracted_slice = tensor.extract_slice %14[0, %arg2, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%extracted_slice_1 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_2 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_2) -> (tensor<1x32x1xi32>) {
%25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%17, %arg6)
%26 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %17)
%27 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%26, %21, %19)
%28 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %17)
%29 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%28, %21, %19)
%30 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%29, %27)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, 0, %27, 0] [1, 1, %30, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
%31 = vector.transfer_write %cst, %6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%32 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, false, true]} : tensor<1x1x?x1xi32>, vector<1x1x32x1xi32>
%33 = vector.transfer_write %32, %31[%c0, %c0, %25, %c0] {in_bounds = [true, true, false, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_5 = tensor.extract_slice %33[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%collapsed = tensor.collapse_shape %extracted_slice_5 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%collapsed_6 = tensor.collapse_shape %arg7 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%34 = vector.transfer_read %collapsed[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
%35 = vector.transfer_read %cst_0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x43x1xi32>, vector<1x1xi32>
%36 = vector.transfer_read %collapsed_6[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
%37 = vector.extract %35[0] : vector<1x1xi32>
%38 = vector.shuffle %37, %37 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%39 = vector.broadcast %38 : vector<32xi32> to vector<1x32xi32>
%40 = arith.muli %34, %39 : vector<1x32xi32>
%41 = arith.addi %40, %36 : vector<1x32xi32>
%42 = vector.transfer_write %41, %collapsed_6[%c0, %c0] {in_bounds = [true, true]} : vector<1x32xi32>, tensor<1x32xi32>
%expanded = tensor.expand_shape %42 [[0], [1, 2]] : tensor<1x32xi32> into tensor<1x32x1xi32>
scf.yield %expanded : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_3 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_3 : tensor<1x60x64x1xi32>
}
scf.yield %16 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %15, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- //
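// Identical to the post-CSE dump above; the enable pass leaves the vectorized form
// untouched.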
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%cst = arith.constant dense<0> : vector<1x1x32x1xi32>
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
%6 = tensor.empty() : tensor<1x1x32x1xi32>
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%7 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%8 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%9 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%10 = affine.min affine_map<(d0) -> (1920, d0)>(%9)
%11 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%12 = affine.min affine_map<(d0) -> (1920, d0)>(%11)
%13 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%12, %10)
%14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %10, 0], sizes = [1, 60, %13, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%15 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %7) -> (tensor<1x60x64x1xi32>) {
%16 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%8, %arg4)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %8)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %12, %10)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %8)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %12, %10)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%extracted_slice = tensor.extract_slice %14[0, %arg2, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%extracted_slice_1 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_2 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_2) -> (tensor<1x32x1xi32>) {
%25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%17, %arg6)
%26 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %17)
%27 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%26, %21, %19)
%28 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %17)
%29 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%28, %21, %19)
%30 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%29, %27)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, 0, %27, 0] [1, 1, %30, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
%31 = vector.transfer_write %cst, %6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%32 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, false, true]} : tensor<1x1x?x1xi32>, vector<1x1x32x1xi32>
%33 = vector.transfer_write %32, %31[%c0, %c0, %25, %c0] {in_bounds = [true, true, false, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_5 = tensor.extract_slice %33[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%collapsed = tensor.collapse_shape %extracted_slice_5 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%collapsed_6 = tensor.collapse_shape %arg7 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%34 = vector.transfer_read %collapsed[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
%35 = vector.transfer_read %cst_0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x43x1xi32>, vector<1x1xi32>
%36 = vector.transfer_read %collapsed_6[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
%37 = vector.extract %35[0] : vector<1x1xi32>
%38 = vector.shuffle %37, %37 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%39 = vector.broadcast %38 : vector<32xi32> to vector<1x32xi32>
%40 = arith.muli %34, %39 : vector<1x32xi32>
%41 = arith.addi %40, %36 : vector<1x32xi32>
%42 = vector.transfer_write %41, %collapsed_6[%c0, %c0] {in_bounds = [true, true]} : vector<1x32xi32>, tensor<1x32xi32>
%expanded = tensor.expand_shape %42 [[0], [1, 2]] : tensor<1x32xi32> into tensor<1x32x1xi32>
scf.yield %expanded : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_3 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_3 : tensor<1x60x64x1xi32>
}
scf.yield %16 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %15, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (iree-linalg-strategy-remove-markers-pass) //----- //
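// No visible change: presumably the strategy marker attributes this pass strips are
// already gone, so the printed IR matches the previous dump.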
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%cst = arith.constant dense<0> : vector<1x1x32x1xi32>
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
%6 = tensor.empty() : tensor<1x1x32x1xi32>
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%7 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%8 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%9 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%10 = affine.min affine_map<(d0) -> (1920, d0)>(%9)
%11 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%12 = affine.min affine_map<(d0) -> (1920, d0)>(%11)
%13 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%12, %10)
%14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %10, 0], sizes = [1, 60, %13, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%15 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %7) -> (tensor<1x60x64x1xi32>) {
%16 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%8, %arg4)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %8)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %12, %10)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %8)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %12, %10)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%extracted_slice = tensor.extract_slice %14[0, %arg2, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%extracted_slice_1 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_2 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_2) -> (tensor<1x32x1xi32>) {
%25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%17, %arg6)
%26 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %17)
%27 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%26, %21, %19)
%28 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %17)
%29 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%28, %21, %19)
%30 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%29, %27)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, 0, %27, 0] [1, 1, %30, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
%31 = vector.transfer_write %cst, %6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%32 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, false, true]} : tensor<1x1x?x1xi32>, vector<1x1x32x1xi32>
%33 = vector.transfer_write %32, %31[%c0, %c0, %25, %c0] {in_bounds = [true, true, false, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_5 = tensor.extract_slice %33[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%collapsed = tensor.collapse_shape %extracted_slice_5 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%collapsed_6 = tensor.collapse_shape %arg7 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%34 = vector.transfer_read %collapsed[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
%35 = vector.transfer_read %cst_0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x43x1xi32>, vector<1x1xi32>
%36 = vector.transfer_read %collapsed_6[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
%37 = vector.extract %35[0] : vector<1x1xi32>
%38 = vector.shuffle %37, %37 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%39 = vector.broadcast %38 : vector<32xi32> to vector<1x32xi32>
%40 = arith.muli %34, %39 : vector<1x32xi32>
%41 = arith.addi %40, %36 : vector<1x32xi32>
%42 = vector.transfer_write %41, %collapsed_6[%c0, %c0] {in_bounds = [true, true]} : vector<1x32xi32>, tensor<1x32xi32>
%expanded = tensor.expand_shape %42 [[0], [1, 2]] : tensor<1x32xi32> into tensor<1x32x1xi32>
scf.yield %expanded : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_3 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_3 : tensor<1x60x64x1xi32>
}
scf.yield %16 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %15, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After CSE (cse) //----- //
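// CSE deduplicates identical side-effect-free ops: the repeated constants and
// affine index computations are materialized once, but the loop nest below is
// structurally the same convolution kernel as in the preceding dump.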
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%cst = arith.constant dense<0> : vector<1x1x32x1xi32>
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
%6 = tensor.empty() : tensor<1x1x32x1xi32>
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%7 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%8 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%9 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%10 = affine.min affine_map<(d0) -> (1920, d0)>(%9)
%11 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%12 = affine.min affine_map<(d0) -> (1920, d0)>(%11)
%13 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%12, %10)
%14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %10, 0], sizes = [1, 60, %13, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%15 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %7) -> (tensor<1x60x64x1xi32>) {
%16 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%8, %arg4)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %8)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %12, %10)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %8)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %12, %10)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%extracted_slice = tensor.extract_slice %14[0, %arg2, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%extracted_slice_1 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_2 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_2) -> (tensor<1x32x1xi32>) {
%25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%17, %arg6)
%26 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %17)
%27 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%26, %21, %19)
%28 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %17)
%29 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%28, %21, %19)
%30 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%29, %27)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, 0, %27, 0] [1, 1, %30, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
%31 = vector.transfer_write %cst, %6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%32 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, false, true]} : tensor<1x1x?x1xi32>, vector<1x1x32x1xi32>
%33 = vector.transfer_write %32, %31[%c0, %c0, %25, %c0] {in_bounds = [true, true, false, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_5 = tensor.extract_slice %33[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%collapsed = tensor.collapse_shape %extracted_slice_5 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%collapsed_6 = tensor.collapse_shape %arg7 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%34 = vector.transfer_read %collapsed[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
%35 = vector.transfer_read %cst_0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x43x1xi32>, vector<1x1xi32>
%36 = vector.transfer_read %collapsed_6[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
%37 = vector.extract %35[0] : vector<1x1xi32>
%38 = vector.shuffle %37, %37 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%39 = vector.broadcast %38 : vector<32xi32> to vector<1x32xi32>
%40 = arith.muli %34, %39 : vector<1x32xi32>
%41 = arith.addi %40, %36 : vector<1x32xi32>
%42 = vector.transfer_write %41, %collapsed_6[%c0, %c0] {in_bounds = [true, true]} : vector<1x32xi32>, tensor<1x32xi32>
%expanded = tensor.expand_shape %42 [[0], [1, 2]] : tensor<1x32xi32> into tensor<1x32x1xi32>
scf.yield %expanded : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_3 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_3 : tensor<1x60x64x1xi32>
}
scf.yield %16 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %15, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- //
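// LinalgStrategyEnablePass is a bookkeeping hook in the codegen strategy
// pipeline; it rewrites nothing here, so this dump is identical to the one
// above.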
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%cst = arith.constant dense<0> : vector<1x1x32x1xi32>
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
%6 = tensor.empty() : tensor<1x1x32x1xi32>
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%7 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%8 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%9 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%10 = affine.min affine_map<(d0) -> (1920, d0)>(%9)
%11 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%12 = affine.min affine_map<(d0) -> (1920, d0)>(%11)
%13 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%12, %10)
%14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %10, 0], sizes = [1, 60, %13, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%15 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %7) -> (tensor<1x60x64x1xi32>) {
%16 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%8, %arg4)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %8)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %12, %10)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %8)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %12, %10)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%extracted_slice = tensor.extract_slice %14[0, %arg2, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%extracted_slice_1 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_2 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_2) -> (tensor<1x32x1xi32>) {
%25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%17, %arg6)
%26 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %17)
%27 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%26, %21, %19)
%28 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %17)
%29 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%28, %21, %19)
%30 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%29, %27)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, 0, %27, 0] [1, 1, %30, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
%31 = vector.transfer_write %cst, %6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%32 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, false, true]} : tensor<1x1x?x1xi32>, vector<1x1x32x1xi32>
%33 = vector.transfer_write %32, %31[%c0, %c0, %25, %c0] {in_bounds = [true, true, false, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_5 = tensor.extract_slice %33[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%collapsed = tensor.collapse_shape %extracted_slice_5 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%collapsed_6 = tensor.collapse_shape %arg7 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%34 = vector.transfer_read %collapsed[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
%35 = vector.transfer_read %cst_0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x43x1xi32>, vector<1x1xi32>
%36 = vector.transfer_read %collapsed_6[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
%37 = vector.extract %35[0] : vector<1x1xi32>
%38 = vector.shuffle %37, %37 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%39 = vector.broadcast %38 : vector<32xi32> to vector<1x32xi32>
%40 = arith.muli %34, %39 : vector<1x32xi32>
%41 = arith.addi %40, %36 : vector<1x32xi32>
%42 = vector.transfer_write %41, %collapsed_6[%c0, %c0] {in_bounds = [true, true]} : vector<1x32xi32>, tensor<1x32xi32>
%expanded = tensor.expand_shape %42 [[0], [1, 2]] : tensor<1x32xi32> into tensor<1x32x1xi32>
scf.yield %expanded : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_3 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_3 : tensor<1x60x64x1xi32>
}
scf.yield %16 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %15, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After LinalgVectorizationExpert (linalg-vectorization-expert-driver) //----- //
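// The vectorization driver has no linalg ops left to vectorize (the
// depthwise-convolution body is already expressed as vector transfer, muli,
// and addi ops), so this dump is again unchanged.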
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%cst = arith.constant dense<0> : vector<1x1x32x1xi32>
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
%6 = tensor.empty() : tensor<1x1x32x1xi32>
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%7 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%8 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%9 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%10 = affine.min affine_map<(d0) -> (1920, d0)>(%9)
%11 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%12 = affine.min affine_map<(d0) -> (1920, d0)>(%11)
%13 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%12, %10)
%14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %10, 0], sizes = [1, 60, %13, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%15 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %7) -> (tensor<1x60x64x1xi32>) {
%16 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%8, %arg4)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %8)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %12, %10)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %8)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %12, %10)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%extracted_slice = tensor.extract_slice %14[0, %arg2, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%extracted_slice_1 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_2 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_2) -> (tensor<1x32x1xi32>) {
%25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%17, %arg6)
%26 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %17)
%27 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%26, %21, %19)
%28 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %17)
%29 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%28, %21, %19)
%30 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%29, %27)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, 0, %27, 0] [1, 1, %30, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
%31 = vector.transfer_write %cst, %6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%32 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, false, true]} : tensor<1x1x?x1xi32>, vector<1x1x32x1xi32>
%33 = vector.transfer_write %32, %31[%c0, %c0, %25, %c0] {in_bounds = [true, true, false, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_5 = tensor.extract_slice %33[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%collapsed = tensor.collapse_shape %extracted_slice_5 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%collapsed_6 = tensor.collapse_shape %arg7 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%34 = vector.transfer_read %collapsed[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
%35 = vector.transfer_read %cst_0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x43x1xi32>, vector<1x1xi32>
%36 = vector.transfer_read %collapsed_6[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
%37 = vector.extract %35[0] : vector<1x1xi32>
%38 = vector.shuffle %37, %37 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%39 = vector.broadcast %38 : vector<32xi32> to vector<1x32xi32>
%40 = arith.muli %34, %39 : vector<1x32xi32>
%41 = arith.addi %40, %36 : vector<1x32xi32>
%42 = vector.transfer_write %41, %collapsed_6[%c0, %c0] {in_bounds = [true, true]} : vector<1x32xi32>, tensor<1x32xi32>
%expanded = tensor.expand_shape %42 [[0], [1, 2]] : tensor<1x32xi32> into tensor<1x32x1xi32>
scf.yield %expanded : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_3 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_3 : tensor<1x60x64x1xi32>
}
scf.yield %16 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %15, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After CSE (cse) //----- //
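// A second CSE run finds no new redundancy; the dump below matches the
// previous one verbatim.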
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%cst = arith.constant dense<0> : vector<1x1x32x1xi32>
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
%6 = tensor.empty() : tensor<1x1x32x1xi32>
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%7 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%8 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%9 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%10 = affine.min affine_map<(d0) -> (1920, d0)>(%9)
%11 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%12 = affine.min affine_map<(d0) -> (1920, d0)>(%11)
%13 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%12, %10)
%14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %10, 0], sizes = [1, 60, %13, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%15 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %7) -> (tensor<1x60x64x1xi32>) {
%16 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%8, %arg4)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %8)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %12, %10)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %8)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %12, %10)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%extracted_slice = tensor.extract_slice %14[0, %arg2, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%extracted_slice_1 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_2 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_2) -> (tensor<1x32x1xi32>) {
%25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%17, %arg6)
%26 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %17)
%27 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%26, %21, %19)
%28 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %17)
%29 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%28, %21, %19)
%30 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%29, %27)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, 0, %27, 0] [1, 1, %30, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
%31 = vector.transfer_write %cst, %6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%32 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, false, true]} : tensor<1x1x?x1xi32>, vector<1x1x32x1xi32>
%33 = vector.transfer_write %32, %31[%c0, %c0, %25, %c0] {in_bounds = [true, true, false, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_5 = tensor.extract_slice %33[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%collapsed = tensor.collapse_shape %extracted_slice_5 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%collapsed_6 = tensor.collapse_shape %arg7 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%34 = vector.transfer_read %collapsed[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
%35 = vector.transfer_read %cst_0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x43x1xi32>, vector<1x1xi32>
%36 = vector.transfer_read %collapsed_6[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
%37 = vector.extract %35[0] : vector<1x1xi32>
%38 = vector.shuffle %37, %37 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%39 = vector.broadcast %38 : vector<32xi32> to vector<1x32xi32>
%40 = arith.muli %34, %39 : vector<1x32xi32>
%41 = arith.addi %40, %36 : vector<1x32xi32>
%42 = vector.transfer_write %41, %collapsed_6[%c0, %c0] {in_bounds = [true, true]} : vector<1x32xi32>, tensor<1x32xi32>
%expanded = tensor.expand_shape %42 [[0], [1, 2]] : tensor<1x32xi32> into tensor<1x32x1xi32>
scf.yield %expanded : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_3 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_3 : tensor<1x60x64x1xi32>
}
scf.yield %16 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %15, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
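// Likewise, the canonicalizer has nothing left to fold at this point.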
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%cst = arith.constant dense<0> : vector<1x1x32x1xi32>
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
%6 = tensor.empty() : tensor<1x1x32x1xi32>
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%7 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%8 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%9 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%10 = affine.min affine_map<(d0) -> (1920, d0)>(%9)
%11 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%12 = affine.min affine_map<(d0) -> (1920, d0)>(%11)
%13 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%12, %10)
%14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %10, 0], sizes = [1, 60, %13, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%15 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %7) -> (tensor<1x60x64x1xi32>) {
%16 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%8, %arg4)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %8)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %12, %10)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %8)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %12, %10)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%extracted_slice = tensor.extract_slice %14[0, %arg2, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%extracted_slice_1 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_2 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_2) -> (tensor<1x32x1xi32>) {
%25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%17, %arg6)
%26 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %17)
%27 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%26, %21, %19)
%28 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %17)
%29 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%28, %21, %19)
%30 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%29, %27)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, 0, %27, 0] [1, 1, %30, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
%31 = vector.transfer_write %cst, %6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%32 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, false, true]} : tensor<1x1x?x1xi32>, vector<1x1x32x1xi32>
%33 = vector.transfer_write %32, %31[%c0, %c0, %25, %c0] {in_bounds = [true, true, false, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_5 = tensor.extract_slice %33[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%collapsed = tensor.collapse_shape %extracted_slice_5 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%collapsed_6 = tensor.collapse_shape %arg7 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%34 = vector.transfer_read %collapsed[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
%35 = vector.transfer_read %cst_0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x43x1xi32>, vector<1x1xi32>
%36 = vector.transfer_read %collapsed_6[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
%37 = vector.extract %35[0] : vector<1x1xi32>
%38 = vector.shuffle %37, %37 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%39 = vector.broadcast %38 : vector<32xi32> to vector<1x32xi32>
%40 = arith.muli %34, %39 : vector<1x32xi32>
%41 = arith.addi %40, %36 : vector<1x32xi32>
%42 = vector.transfer_write %41, %collapsed_6[%c0, %c0] {in_bounds = [true, true]} : vector<1x32xi32>, tensor<1x32xi32>
%expanded = tensor.expand_shape %42 [[0], [1, 2]] : tensor<1x32xi32> into tensor<1x32x1xi32>
scf.yield %expanded : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_3 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_3 : tensor<1x60x64x1xi32>
}
scf.yield %16 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %15, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After OptimizeVectorTransfer (iree-codegen-optimize-vector-transfer) //----- //
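// OptimizeVectorTransfer drops the unit dimensions from the vector transfers
// (vector<1x1x32x1xi32> -> vector<32x1xi32>, vector<1x32xi32> -> vector<32xi32>)
// and folds the per-tap vector.extract / vector.shuffle / vector.broadcast
// sequence into a single splatting vector.shuffle.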
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%cst = arith.constant dense<0> : vector<32x1xi32>
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
%6 = tensor.empty() : tensor<1x1x32x1xi32>
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%7 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%8 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%9 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%10 = affine.min affine_map<(d0) -> (1920, d0)>(%9)
%11 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%12 = affine.min affine_map<(d0) -> (1920, d0)>(%11)
%13 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%12, %10)
%14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %10, 0], sizes = [1, 60, %13, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%15 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %7) -> (tensor<1x60x64x1xi32>) {
%16 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%8, %arg4)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %8)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %12, %10)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %8)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %12, %10)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%extracted_slice = tensor.extract_slice %14[0, %arg2, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%extracted_slice_1 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_2 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_2) -> (tensor<1x32x1xi32>) {
%25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%17, %arg6)
%26 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %17)
%27 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%26, %21, %19)
%28 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %17)
%29 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%28, %21, %19)
%30 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%29, %27)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, 0, %27, 0] [1, 1, %30, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
%31 = vector.transfer_write %cst, %6[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, tensor<1x1x32x1xi32>
%32 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : tensor<1x1x?x1xi32>, vector<32x1xi32>
%33 = vector.transfer_write %32, %31[%c0, %c0, %25, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_5 = tensor.extract_slice %33[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%collapsed = tensor.collapse_shape %extracted_slice_5 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%collapsed_6 = tensor.collapse_shape %arg7 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%34 = vector.transfer_read %collapsed[%c0, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x32xi32>, vector<32xi32>
%35 = vector.transfer_read %cst_0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x43x1xi32>, vector<1xi32>
%36 = vector.transfer_read %collapsed_6[%c0, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x32xi32>, vector<32xi32>
%37 = vector.shuffle %35, %35 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
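// The all-zero shuffle mask splats the single filter tap in %35 across all
// 32 lanes, standing in for the extract + broadcast pair of the earlier dumps.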
%38 = arith.muli %34, %37 : vector<32xi32>
%39 = arith.addi %38, %36 : vector<32xi32>
%40 = vector.transfer_write %39, %collapsed_6[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, tensor<1x32xi32>
%expanded = tensor.expand_shape %40 [[0], [1, 2]] : tensor<1x32xi32> into tensor<1x32x1xi32>
scf.yield %expanded : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_3 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_3 : tensor<1x60x64x1xi32>
}
scf.yield %16 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %15, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
// -----// IR Dump After EliminateEmptyTensors (iree-eliminate-empty-tensors) //----- //
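// EliminateEmptyTensors tries to forward tensor.empty values into existing
// destination tensors ahead of bufferization; the scratch tile %6 is
// rewritten fresh on every inner iteration and so survives unchanged here.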
module {
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%cst = arith.constant dense<0> : vector<32x1xi32>
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
%6 = tensor.empty() : tensor<1x1x32x1xi32>
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%7 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%8 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%9 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%10 = affine.min affine_map<(d0) -> (1920, d0)>(%9)
%11 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%12 = affine.min affine_map<(d0) -> (1920, d0)>(%11)
%13 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%12, %10)
%14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %10, 0], sizes = [1, 60, %13, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%15 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %7) -> (tensor<1x60x64x1xi32>) {
%16 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%8, %arg4)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %8)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %12, %10)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %8)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %12, %10)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%extracted_slice = tensor.extract_slice %14[0, %arg2, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%extracted_slice_1 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_2 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_2) -> (tensor<1x32x1xi32>) {
%25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%17, %arg6)
%26 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %17)
%27 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%26, %21, %19)
%28 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %17)
%29 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%28, %21, %19)
%30 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%29, %27)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, 0, %27, 0] [1, 1, %30, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
%31 = vector.transfer_write %cst, %6[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, tensor<1x1x32x1xi32>
%32 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : tensor<1x1x?x1xi32>, vector<32x1xi32>
%33 = vector.transfer_write %32, %31[%c0, %c0, %25, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_5 = tensor.extract_slice %33[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%collapsed = tensor.collapse_shape %extracted_slice_5 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%collapsed_6 = tensor.collapse_shape %arg7 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%34 = vector.transfer_read %collapsed[%c0, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x32xi32>, vector<32xi32>
%35 = vector.transfer_read %cst_0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x43x1xi32>, vector<1xi32>
%36 = vector.transfer_read %collapsed_6[%c0, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x32xi32>, vector<32xi32>
%37 = vector.shuffle %35, %35 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%38 = arith.muli %34, %37 : vector<32xi32>
%39 = arith.addi %38, %36 : vector<32xi32>
%40 = vector.transfer_write %39, %collapsed_6[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, tensor<1x32xi32>
%expanded = tensor.expand_shape %40 [[0], [1, 2]] : tensor<1x32xi32> into tensor<1x32x1xi32>
scf.yield %expanded : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_3 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_3 : tensor<1x60x64x1xi32>
}
scf.yield %16 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %15, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
}
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
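// EmptyTensorToAllocTensor rewrites the surviving tensor.empty into
// bufferization.alloc_tensor (%6 below) so that bufferization will assign
// the scratch tile a concrete allocation.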
module {
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%cst = arith.constant dense<0> : vector<32x1xi32>
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
%6 = bufferization.alloc_tensor() : tensor<1x1x32x1xi32>
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%7 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%8 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%9 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%10 = affine.min affine_map<(d0) -> (1920, d0)>(%9)
%11 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%12 = affine.min affine_map<(d0) -> (1920, d0)>(%11)
%13 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%12, %10)
%14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %10, 0], sizes = [1, 60, %13, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%15 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %7) -> (tensor<1x60x64x1xi32>) {
%16 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%8, %arg4)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %8)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %12, %10)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %8)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %12, %10)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%extracted_slice = tensor.extract_slice %14[0, %arg2, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%extracted_slice_1 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_2 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_2) -> (tensor<1x32x1xi32>) {
%25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%17, %arg6)
%26 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %17)
%27 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%26, %21, %19)
%28 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %17)
%29 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%28, %21, %19)
%30 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%29, %27)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, 0, %27, 0] [1, 1, %30, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
%31 = vector.transfer_write %cst, %6[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, tensor<1x1x32x1xi32>
%32 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : tensor<1x1x?x1xi32>, vector<32x1xi32>
%33 = vector.transfer_write %32, %31[%c0, %c0, %25, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_5 = tensor.extract_slice %33[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%collapsed = tensor.collapse_shape %extracted_slice_5 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%collapsed_6 = tensor.collapse_shape %arg7 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%34 = vector.transfer_read %collapsed[%c0, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x32xi32>, vector<32xi32>
%35 = vector.transfer_read %cst_0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x43x1xi32>, vector<1xi32>
%36 = vector.transfer_read %collapsed_6[%c0, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x32xi32>, vector<32xi32>
%37 = vector.shuffle %35, %35 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%38 = arith.muli %34, %37 : vector<32xi32>
%39 = arith.addi %38, %36 : vector<32xi32>
%40 = vector.transfer_write %39, %collapsed_6[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, tensor<1x32xi32>
%expanded = tensor.expand_shape %40 [[0], [1, 2]] : tensor<1x32xi32> into tensor<1x32x1xi32>
scf.yield %expanded : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_3 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_3 : tensor<1x60x64x1xi32>
}
scf.yield %16 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %15, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
}
// -----// IR Dump After IREEComprehensiveBufferize (iree-codegen-iree-comprehensive-bufferize) //----- //
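// Comprehensive bufferization replaces tensors with memrefs: the HAL binding
// subspans become aligned memrefs of the full 1x1080x1920x1 buffers,
// tensor.extract_slice becomes memref.subview, the dispatch tensor
// load/store pairs become subviews of the bound buffers, and the
// alloc_tensor becomes the stack allocation at the top of the function.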
module {
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1080 = arith.constant 1080 : index
%c1920 = arith.constant 1920 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
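// 64-byte-aligned stack slot derived from the bufferization.alloc_tensor of
// the previous dump; it holds the zero-padded 1x1x32x1 input window that the
// innermost loop fills and reads back on each iteration.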
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c1080 step %4 {
scf.for %arg1 = %5 to %c1920 step %6 {
%subview = memref.subview %2[0, %arg0, %arg1, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%subview_1 = memref.subview %1[0, %arg0, %9, 0] [1, 60, %12, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%13 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %subview) -> (memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
%14 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
%15 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%17 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%16, %11, %9)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %11, %9)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%19, %17)
%subview_3 = memref.subview %subview_1[0, %arg2, %17, 0] [1, 1, %20, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_4 = memref.subview %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%21 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %subview_5) -> (memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
%22 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%15, %arg6)
%23 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %15)
%24 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%23, %19, %17)
%25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %15)
%26 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%25, %19, %17)
%27 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%26, %24)
%subview_8 = memref.subview %subview_3[0, 0, %24, 0] [1, 1, %27, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%28 = vector.transfer_read %subview_8[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32x1xi32>
vector.transfer_write %28, %alloca[%c0, %c0, %22, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%subview_9 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_9 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
%collapse_shape_10 = memref.collapse_shape %arg7 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%29 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%30 = vector.transfer_read %0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32>
%31 = vector.transfer_read %collapse_shape_10[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32xi32>
%32 = vector.shuffle %30, %30 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%33 = arith.muli %29, %32 : vector<32xi32>
%34 = arith.addi %33, %31 : vector<32xi32>
vector.transfer_write %34, %collapse_shape_10[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.yield %arg7 : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
%subview_6 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%21 : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_6 : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
%subview_7 = memref.subview %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_4 : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_7 : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
scf.yield %arg5 : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
scf.yield %14 : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
%subview_2 = memref.subview %2[0, %arg0, %arg1, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_2 : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
}
}
return
}
}
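// NOTE: IREEComprehensiveBufferize has replaced the tensor-level
// flow.dispatch.tensor.load/store ops with memref.subview slices of the two HAL
// storage-buffer bindings. The accumulator now lives directly in the writeonly
// destination buffer %2, only the zero-padded 1x1x32x1 input window is staged
// through the 64-byte-aligned stack %alloca, and the trailing copy
// linalg.generics it inserted are cleaned up by the passes below. The
// strided<[2073600, 1920, 1, 1], offset: ?> layouts record where each tile sits
// inside the full 1x1080x1920x1 buffer (2073600 = 1080 * 1920). A minimal
// sketch of that addressing, using an illustrative @tile_view helper that is
// not part of the dump:
func.func @tile_view(%buf: memref<1x1080x1920x1xi32>, %y: index, %x: index) -> memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> {
  // Element (0, %y + i, %x + j, 0) of %buf sits at linear offset
  // %y * 1920 + %x + i * 1920 + j -- exactly the strides in the result type.
  %v = memref.subview %buf[0, %y, %x, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
  return %v : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
}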
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
module {
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1080 = arith.constant 1080 : index
%c1920 = arith.constant 1920 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c1080 step %4 {
scf.for %arg1 = %5 to %c1920 step %6 {
%subview = memref.subview %2[0, %arg0, %arg1, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%subview_1 = memref.subview %1[0, %arg0, %9, 0] [1, 60, %12, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%13 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %subview) -> (memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
%14 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
%15 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%17 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%16, %11, %9)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %11, %9)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%19, %17)
%subview_3 = memref.subview %subview_1[0, %arg2, %17, 0] [1, 1, %20, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_4 = memref.subview %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%21 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %subview_5) -> (memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
%22 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%15, %arg6)
%23 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %15)
%24 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%23, %19, %17)
%25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %15)
%26 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%25, %19, %17)
%27 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%26, %24)
%subview_8 = memref.subview %subview_3[0, 0, %24, 0] [1, 1, %27, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%28 = vector.transfer_read %subview_8[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32x1xi32>
vector.transfer_write %28, %alloca[%c0, %c0, %22, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%subview_9 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_9 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
%collapse_shape_10 = memref.collapse_shape %arg7 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%29 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%30 = vector.transfer_read %0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32>
%31 = vector.transfer_read %collapse_shape_10[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32xi32>
%32 = vector.shuffle %30, %30 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%33 = arith.muli %29, %32 : vector<32xi32>
%34 = arith.addi %33, %31 : vector<32xi32>
vector.transfer_write %34, %collapse_shape_10[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.yield %arg7 : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
%subview_6 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%21 : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_6 : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
%subview_7 = memref.subview %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_4 : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_7 : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
scf.yield %arg5 : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
scf.yield %14 : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
%subview_2 = memref.subview %2[0, %arg0, %arg1, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_2 : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
}
}
return
}
}
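// NOTE: ResolveShapedTypeResultDims leaves this dispatch untouched -- the dump
// above is identical to the post-bufferization IR, since every shape is already
// folded to a static tile size (60, 64, 32) or to the affine expressions that
// clamp the input-window width.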
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1080 = arith.constant 1080 : index
%c1920 = arith.constant 1920 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c1080 step %4 {
scf.for %arg1 = %5 to %c1920 step %6 {
%subview = memref.subview %2[0, %arg0, %arg1, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%subview_1 = memref.subview %1[0, %arg0, %9, 0] [1, 60, %12, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg2 = %c0 to %c60 step %c1 {
scf.for %arg3 = %c0 to %c64 step %c32 {
%13 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg3)
%14 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg3, %7)
%15 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%14, %11, %9)
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg3, %7)
%17 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%16, %11, %9)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%17, %15)
%subview_3 = memref.subview %subview_1[0, %arg2, %15, 0] [1, 1, %18, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_4 = memref.subview %subview[0, %arg2, %arg3, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg4 = %c0 to %c43 step %c1 {
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%13, %arg4)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %13)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %17, %15)
%22 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg4, %13)
%23 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%22, %17, %15)
%24 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%23, %21)
%subview_8 = memref.subview %subview_3[0, 0, %21, 0] [1, 1, %24, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%25 = vector.transfer_read %subview_8[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32x1xi32>
vector.transfer_write %25, %alloca[%c0, %c0, %19, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%subview_9 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_9 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
%collapse_shape_10 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%26 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%27 = vector.transfer_read %0[%c0, %arg4, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32>
%28 = vector.transfer_read %collapse_shape_10[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32xi32>
%29 = vector.shuffle %27, %27 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%30 = arith.muli %26, %29 : vector<32xi32>
%31 = arith.addi %30, %28 : vector<32xi32>
vector.transfer_write %31, %collapse_shape_10[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
%subview_6 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%subview_5 : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_6 : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
%subview_7 = memref.subview %subview[0, %arg2, %arg3, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_4 : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_7 : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
}
}
%subview_2 = memref.subview %2[0, %arg0, %arg1, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_2 : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
}
}
return
}
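// NOTE: the canonicalizer has stripped the scf.for iter_args that carried
// memref values: each loop only ever yielded its own block argument, and
// memrefs are mutated in place, so the loop-carried SSA value was dead.
// Schematically (illustrative, not from the dump):
//
//   %r = scf.for %i = %lb to %ub step %s iter_args(%a = %m) -> (memref<...>) {
//     ...               // all updates write through %a in place
//     scf.yield %a : memref<...>
//   }
//
// becomes
//
//   scf.for %i = %lb to %ub step %s {
//     ...               // all updates write through %m directly
//   }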
// -----// IR Dump After CSE (cse) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1080 = arith.constant 1080 : index
%c1920 = arith.constant 1920 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c1080 step %4 {
scf.for %arg1 = %5 to %c1920 step %6 {
%subview = memref.subview %2[0, %arg0, %arg1, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%subview_1 = memref.subview %1[0, %arg0, %9, 0] [1, 60, %12, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg2 = %c0 to %c60 step %c1 {
scf.for %arg3 = %c0 to %c64 step %c32 {
%13 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg3)
%14 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg3, %7)
%15 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%14, %11, %9)
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg3, %7)
%17 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%16, %11, %9)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%17, %15)
%subview_2 = memref.subview %subview_1[0, %arg2, %15, 0] [1, 1, %18, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %subview[0, %arg2, %arg3, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %cst_0, %subview_3[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg4 = %c0 to %c43 step %c1 {
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%13, %arg4)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %13)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %17, %15)
%22 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg4, %13)
%23 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%22, %17, %15)
%24 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%23, %21)
%subview_5 = memref.subview %subview_2[0, 0, %21, 0] [1, 1, %24, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%25 = vector.transfer_read %subview_5[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32x1xi32>
vector.transfer_write %25, %alloca[%c0, %c0, %19, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%subview_6 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_6 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
%collapse_shape_7 = memref.collapse_shape %subview_4 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%26 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%27 = vector.transfer_read %0[%c0, %arg4, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32>
%28 = vector.transfer_read %collapse_shape_7[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32xi32>
%29 = vector.shuffle %27, %27 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%30 = arith.muli %26, %29 : vector<32xi32>
%31 = arith.addi %30, %28 : vector<32xi32>
vector.transfer_write %31, %collapse_shape_7[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%subview_4 : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_4 : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_3 : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_3 : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
}
}
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
}
}
return
}
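// NOTE: CSE has deduplicated the memref.subview ops, so the source and
// destination views of each copy now resolve to the same SSA value: all three
// trailing linalg.generic copies above read and write the identical subview
// (ins == outs). They are erased by the canonicalizer run that follows.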
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1080 = arith.constant 1080 : index
%c1920 = arith.constant 1920 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c1080 step %4 {
scf.for %arg1 = %5 to %c1920 step %6 {
%subview = memref.subview %2[0, %arg0, %arg1, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%subview_1 = memref.subview %1[0, %arg0, %9, 0] [1, 60, %12, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg2 = %c0 to %c60 step %c1 {
scf.for %arg3 = %c0 to %c64 step %c32 {
%13 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg3)
%14 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg3, %7)
%15 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%14, %11, %9)
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg3, %7)
%17 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%16, %11, %9)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%17, %15)
%subview_2 = memref.subview %subview_1[0, %arg2, %15, 0] [1, 1, %18, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %subview[0, %arg2, %arg3, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %cst_0, %subview_3[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg4 = %c0 to %c43 step %c1 {
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%13, %arg4)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %13)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %17, %15)
%22 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg4, %13)
%23 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%22, %17, %15)
%24 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%23, %21)
%subview_5 = memref.subview %subview_2[0, 0, %21, 0] [1, 1, %24, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%25 = vector.transfer_read %subview_5[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32x1xi32>
vector.transfer_write %25, %alloca[%c0, %c0, %19, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%subview_6 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_6 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
%collapse_shape_7 = memref.collapse_shape %subview_4 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%26 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%27 = vector.transfer_read %0[%c0, %arg4, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32>
%28 = vector.transfer_read %collapse_shape_7[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32xi32>
%29 = vector.shuffle %27, %27 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%30 = arith.muli %26, %29 : vector<32xi32>
%31 = arith.addi %30, %28 : vector<32xi32>
vector.transfer_write %31, %collapse_shape_7[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
}
}
}
return
}
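// NOTE: with ins == outs the identity linalg.generic copies were no-ops, and
// this canonicalizer run has erased them. What remains is the bare compute
// loop nest: zero-fill a 32-wide output strip, then accumulate the 43 filter
// taps into it in place.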
// -----// IR Dump After CleanupBufferAllocView (iree-codegen-cleanup-buffer-alloc-view) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1080 = arith.constant 1080 : index
%c1920 = arith.constant 1920 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c1080 step %4 {
scf.for %arg1 = %5 to %c1920 step %6 {
%subview = memref.subview %2[0, %arg0, %arg1, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%subview_1 = memref.subview %1[0, %arg0, %9, 0] [1, 60, %12, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg2 = %c0 to %c60 step %c1 {
scf.for %arg3 = %c0 to %c64 step %c32 {
%13 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg3)
%14 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg3, %7)
%15 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%14, %11, %9)
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg3, %7)
%17 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%16, %11, %9)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%17, %15)
%subview_2 = memref.subview %subview_1[0, %arg2, %15, 0] [1, 1, %18, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %subview[0, %arg2, %arg3, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %cst_0, %subview_3[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg4 = %c0 to %c43 step %c1 {
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%13, %arg4)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %13)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %17, %15)
%22 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg4, %13)
%23 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%22, %17, %15)
%24 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%23, %21)
%subview_5 = memref.subview %subview_2[0, 0, %21, 0] [1, 1, %24, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%25 = vector.transfer_read %subview_5[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32x1xi32>
vector.transfer_write %25, %alloca[%c0, %c0, %19, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%subview_6 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_6 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
%collapse_shape_7 = memref.collapse_shape %subview_4 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%26 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%27 = vector.transfer_read %0[%c0, %arg4, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32>
%28 = vector.transfer_read %collapse_shape_7[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32xi32>
%29 = vector.shuffle %27, %27 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%30 = arith.muli %26, %29 : vector<32xi32>
%31 = arith.addi %30, %28 : vector<32xi32>
vector.transfer_write %31, %collapse_shape_7[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
}
}
}
return
}
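// NOTE: CleanupBufferAllocView finds nothing to fold in this dispatch; the
// dump is unchanged. The 1x1x32x1 alloca survives because it is still needed
// as the zero-padded staging buffer for the shifted input window.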
// -----// IR Dump After EraseHALDescriptorTypeFromMemRef (iree-codegen-erase-hal-descriptor-type-from-memref) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1080 = arith.constant 1080 : index
%c1920 = arith.constant 1920 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c1080 step %4 {
scf.for %arg1 = %5 to %c1920 step %6 {
%subview = memref.subview %2[0, %arg0, %arg1, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%subview_1 = memref.subview %1[0, %arg0, %9, 0] [1, 60, %12, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
scf.for %arg2 = %c0 to %c60 step %c1 {
scf.for %arg3 = %c0 to %c64 step %c32 {
%13 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg3)
%14 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg3, %7)
%15 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%14, %11, %9)
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg3, %7)
%17 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%16, %11, %9)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%17, %15)
%subview_2 = memref.subview %subview_1[0, %arg2, %15, 0] [1, 1, %18, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_3 = memref.subview %subview[0, %arg2, %arg3, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %subview_3[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
scf.for %arg4 = %c0 to %c43 step %c1 {
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%13, %arg4)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %13)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %17, %15)
%22 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg4, %13)
%23 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%22, %17, %15)
%24 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%23, %21)
%subview_5 = memref.subview %subview_2[0, 0, %21, 0] [1, 1, %24, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%25 = vector.transfer_read %subview_5[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
vector.transfer_write %25, %alloca[%c0, %c0, %19, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%subview_6 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_6 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
%collapse_shape_7 = memref.collapse_shape %subview_4 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
%26 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%27 = vector.transfer_read %0[%c0, %arg4, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32>
%28 = vector.transfer_read %collapse_shape_7[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%29 = vector.shuffle %27, %27 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%30 = arith.muli %26, %29 : vector<32xi32>
%31 = arith.addi %30, %28 : vector<32xi32>
vector.transfer_write %31, %collapse_shape_7[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>>
}
}
}
}
}
return
}
// -----// IR Dump After RemoveSingleIterationLoop (iree-codegen-remove-single-iteration-loop) //----- //
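// The workgroup-distribution loops visible above are gone: the 30x18 grid
// assigns exactly one 60x64 output tile per workgroup (30 * 64 = 1920,
// 18 * 60 = 1080), so each scf.for over the workgroup count ran a single
// iteration and folds away; the tile offsets now come straight from the
// workgroup IDs.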
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
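  // Depthwise filter weights: a symmetric 43-tap kernel whose taps sum to
  // 65536 = 2^16, suggesting a fixed-point (Q16) normalized 1-D blur along
  // the width dimension (interpretation only; the normalizing shift is not
  // part of this dispatch).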
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
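  // 64-byte-aligned stack buffer: holds one zero-padded 32-element input
  // window per filter-tap iteration.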
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
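  // Input halo for this 60x64 tile: columns [max(0, %4 - 21), min(1920, %4 + 85))
  // of the source image, i.e. the 64 output columns plus a 21-column border on
  // each side for the 43-tap filter (64 + 2 * 21 = 106). %5 is the left
  // zero-padding amount (21 on the leftmost tile, 0 elsewhere).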
%5 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%4)
%6 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%4)
%7 = affine.min affine_map<(d0) -> (1920, d0)>(%6)
%8 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%4)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%9, %7)
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%11 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%5, %arg1)
%12 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg1, %5)
%13 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%12, %9, %7)
%14 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg1, %5)
%15 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%14, %9, %7)
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
%subview_2 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_3 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
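      // Zero-initialize the 1x1x32x1 output strip; the tap loop below
      // accumulates into it in place.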
vector.transfer_write %cst_0, %subview_3[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
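      // Reduction over the 43 filter taps. In scalar form each output column x
      // of the strip computes
      //   out[y][x] = sum_{k = 0}^{42} w[k] * in_pad[y][x + k]
      // with in_pad the input row zero-extended by 21 columns on both sides;
      // %17-%22 below clip each tap's 32-column window to the in-bounds part
      // of the halo.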
scf.for %arg2 = %c0 to %c43 step %c1 {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%subview_5 = memref.subview %subview_2[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
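        // Materialize the zero-padded window for this tap: clear the stack
        // buffer, read the valid slice (zero-padding past its end), and store
        // it at the left-pad offset %17.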
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%23 = vector.transfer_read %subview_5[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%subview_6 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_6 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
%collapse_shape_7 = memref.collapse_shape %subview_4 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
%24 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%25 = vector.transfer_read %0[%c0, %arg2, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32>
%26 = vector.transfer_read %collapse_shape_7[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
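        // Broadcast the scalar tap weight to all 32 lanes, then multiply and
        // accumulate into the output strip.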
%27 = vector.shuffle %25, %25 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%28 = arith.muli %24, %27 : vector<32xi32>
%29 = arith.addi %28, %26 : vector<32xi32>
vector.transfer_write %29, %collapse_shape_7[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>>
}
}
}
return
}
// -----// IR Dump After LinalgStrategyLowerVectorsPass (iree-linalg-strategy-lower-vectors-pass) //----- //
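// (no change: the function below is identical to the previous dump)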
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%5 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%4)
%6 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%4)
%7 = affine.min affine_map<(d0) -> (1920, d0)>(%6)
%8 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%4)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%9, %7)
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%11 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%5, %arg1)
%12 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg1, %5)
%13 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%12, %9, %7)
%14 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg1, %5)
%15 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%14, %9, %7)
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
%subview_2 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_3 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %subview_3[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%subview_5 = memref.subview %subview_2[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%23 = vector.transfer_read %subview_5[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%subview_6 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_6 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
%collapse_shape_7 = memref.collapse_shape %subview_4 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
%24 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%25 = vector.transfer_read %0[%c0, %arg2, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32>
%26 = vector.transfer_read %collapse_shape_7[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%27 = vector.shuffle %25, %25 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%28 = arith.muli %24, %27 : vector<32xi32>
%29 = arith.addi %28, %26 : vector<32xi32>
vector.transfer_write %29, %collapse_shape_7[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>>
}
}
}
return
}
// -----// IR Dump After CSE (cse) //----- //
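// By this dump the tile-bound affine maps have been folded into symbol form
// over %workgroup_id_x, and the loop-invariant alloca subview / collapse_shape
// pair has been hoisted to function scope; the output strip is likewise
// collapsed once per 32-column strip instead of once per filter tap.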
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6]
%8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7]
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5]
%12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5]
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7]
%14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5]
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7]
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
%subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%24 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%25 = vector.transfer_read %0[%c0, %arg2, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32>
%26 = vector.transfer_read %collapse_shape_6[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%27 = vector.shuffle %25, %25 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%28 = arith.muli %24, %27 : vector<32xi32>
%29 = arith.addi %28, %26 : vector<32xi32>
vector.transfer_write %29, %collapse_shape_6[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>>
}
}
}
return
}
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- //
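// (no change: the function below is identical to the previous dump)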
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6]
%8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7]
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5]
%12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5]
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7]
%14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5]
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7]
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
%subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%24 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%25 = vector.transfer_read %0[%c0, %arg2, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32>
%26 = vector.transfer_read %collapse_shape_6[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%27 = vector.shuffle %25, %25 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%28 = arith.muli %24, %27 : vector<32xi32>
%29 = arith.addi %28, %26 : vector<32xi32>
vector.transfer_write %29, %collapse_shape_6[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>>
}
}
}
return
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (iree-linalg-strategy-remove-markers-pass) //----- //
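// (no change: the function below is identical to the previous dump)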
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6]
%8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7]
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5]
%12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5]
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7]
%14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5]
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7]
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
%subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%24 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%25 = vector.transfer_read %0[%c0, %arg2, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32>
%26 = vector.transfer_read %collapse_shape_6[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%27 = vector.shuffle %25, %25 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%28 = arith.muli %24, %27 : vector<32xi32>
%29 = arith.addi %28, %26 : vector<32xi32>
vector.transfer_write %29, %collapse_shape_6[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>>
}
}
}
return
}
// -----// IR Dump After CSE (cse) //----- //
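// (no change: the function below is identical to the previous dump)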
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6]
%8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7]
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5]
%12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5]
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7]
%14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5]
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7]
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
%subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%24 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%25 = vector.transfer_read %0[%c0, %arg2, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32>
%26 = vector.transfer_read %collapse_shape_6[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%27 = vector.shuffle %25, %25 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%28 = arith.muli %24, %27 : vector<32xi32>
%29 = arith.addi %28, %26 : vector<32xi32>
vector.transfer_write %29, %collapse_shape_6[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>>
}
}
}
return
}
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- //
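// (no change: the function below is identical to the previous dump)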
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6]
%8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7]
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5]
%12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5]
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7]
%14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5]
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7]
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
%subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%24 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%25 = vector.transfer_read %0[%c0, %arg2, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32>
%26 = vector.transfer_read %collapse_shape_6[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%27 = vector.shuffle %25, %25 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%28 = arith.muli %24, %27 : vector<32xi32>
%29 = arith.addi %28, %26 : vector<32xi32>
vector.transfer_write %29, %collapse_shape_6[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>>
}
}
}
return
}
// -----// IR Dump After LinalgVectorLowering (linalg-vector-lowering) //----- //
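// (no change: the function below is identical to the previous dump)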
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6]
%8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7]
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5]
%12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5]
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7]
%14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5]
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7]
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
%subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%24 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%25 = vector.transfer_read %0[%c0, %arg2, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32>
%26 = vector.transfer_read %collapse_shape_6[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%27 = vector.shuffle %25, %25 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%28 = arith.muli %24, %27 : vector<32xi32>
%29 = arith.addi %28, %26 : vector<32xi32>
vector.transfer_write %29, %collapse_shape_6[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>>
}
}
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
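// (no change: the function below is identical to the previous dump)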
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6]
%8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7]
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5]
%12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5]
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7]
%14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5]
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7]
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
%subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%24 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%25 = vector.transfer_read %0[%c0, %arg2, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32>
%26 = vector.transfer_read %collapse_shape_6[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%27 = vector.shuffle %25, %25 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%28 = arith.muli %24, %27 : vector<32xi32>
%29 = arith.addi %28, %26 : vector<32xi32>
vector.transfer_write %29, %collapse_shape_6[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>>
}
}
}
return
}
// -----// IR Dump After CSE (cse) //----- //
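// (no change: the function below is identical to the previous dump)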
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6]
%8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7]
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5]
%12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5]
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7]
%14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5]
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7]
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
%subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
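      // annotation (not part of the dump): %arg2 walks the 43 filter taps; each iteration
      // accumulates one tap's contribution into the 32-wide output strip %collapse_shape_6.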
scf.for %arg2 = %c0 to %c43 step %c1 {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
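        // annotation (not part of the dump): zero padding is materialized on the stack:
        // %alloca is zero-filled, then only the in-bounds part of the input window
        // (%subview_7, width %22) is copied back in at offset %17.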
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%24 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%25 = vector.transfer_read %0[%c0, %arg2, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32>
%26 = vector.transfer_read %collapse_shape_6[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
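        // annotation (not part of the dump): the scalar tap %25 is broadcast to 32 lanes by
        // vector.shuffle, multiplied with the staged input %24, and added to the output strip %26.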
%27 = vector.shuffle %25, %25 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%28 = arith.muli %24, %27 : vector<32xi32>
%29 = arith.addi %28, %26 : vector<32xi32>
vector.transfer_write %29, %collapse_shape_6[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>>
}
}
}
return
}
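// -----// Reader annotation (not part of the IREE dump) //----- //
// A minimal scalar model of what the vectorized loop nest above computes, as a
// reading aid while scanning the dumps below. Assumptions (not taken from the
// dump): the C function name and signature are illustrative, and the zero
// padding of (43 - 1) / 2 = 21 columns on each side of x is inferred from the
// affine.min/max clamping against [0, 1920) above.
//
//   #include <stdint.h>
//
//   // One image row of the 1x1080x1920x1 NHWC i32 tensor, convolved with a
//   // 43-tap depthwise filter along x; positions outside [0, width) read 0.
//   static void depthwise_conv_row_ref(const int32_t *in, const int32_t *w,
//                                      int32_t *out, int32_t width) {
//     const int32_t taps = 43, pad = 21;
//     for (int32_t x = 0; x < width; ++x) {
//       int32_t acc = 0;                 // the dense<0> vector init above
//       for (int32_t k = 0; k < taps; ++k) {
//         int32_t src = x + k - pad;     // window position; may fall in padding
//         if (src >= 0 && src < width)
//           acc += in[src] * w[k];       // i32 multiply-accumulate (arith.muli / arith.addi)
//       }
//       out[x] = acc;
//     }
//   }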
// -----// IR Dump After LinalgStrategyLowerVectorsPass (iree-linalg-strategy-lower-vectors-pass) //----- //
// (function body unchanged; identical to the first full dump above)
// -----// IR Dump After CSE (cse) //----- //
// (function body unchanged; identical to the first full dump above)
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- //
// (function body unchanged; identical to the first full dump above)
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (iree-linalg-strategy-remove-markers-pass) //----- //
// (function body unchanged; identical to the first full dump above)
// -----// IR Dump After CSE (cse) //----- //
// (function body unchanged; identical to the first full dump above)
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- //
// (function body unchanged; identical to the first full dump above)
// -----// IR Dump After LinalgVectorLowering (linalg-vector-lowering) //----- //
// (function body unchanged; identical to the first full dump above)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (function body unchanged; identical to the first full dump above)
// -----// IR Dump After CSE (cse) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6]
%8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7]
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5]
%12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5]
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7]
%14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5]
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7]
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
%subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%24 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%25 = vector.transfer_read %0[%c0, %arg2, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32>
%26 = vector.transfer_read %collapse_shape_6[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%27 = vector.shuffle %25, %25 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%28 = arith.muli %24, %27 : vector<32xi32>
%29 = arith.addi %28, %26 : vector<32xi32>
vector.transfer_write %29, %collapse_shape_6[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>>
}
}
}
return
}
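For orientation: every dump in this file implements the same computation, a 43-tap depthwise convolution along the width of a 1x1080x1920x1 i32 image (the kernel is 1x43x1 in HWC layout, so the h dimension is trivial), with 21 columns of zero padding on each side so the output keeps its 1920-column width. Below is a minimal NumPy sketch of that reference semantics; it is my own reconstruction from the dispatch name, the shapes, and the %cst filter, not anything emitted by IREE:

import numpy as np

# The 43 filter taps from %cst (symmetric, Gaussian-like), HWC shape 1x43x1.
TAPS = np.array([157, 206, 268, 344, 436, 545, 673, 818, 982, 1163, 1358,
                 1566, 1780, 1997, 2210, 2413, 2600, 2763, 2897, 2996, 3057,
                 3078, 3057, 2996, 2897, 2763, 2600, 2413, 2210, 1997, 1780,
                 1566, 1358, 1163, 982, 818, 673, 545, 436, 344, 268, 206,
                 157], dtype=np.int32)

def depthwise_conv_w(image):
    """image: (1, 1080, 1920, 1) int32 NHWC array; returns the same shape."""
    pad = (len(TAPS) - 1) // 2                          # 21 columns per side
    padded = np.pad(image, ((0, 0), (0, 0), (pad, pad), (0, 0)))
    out = np.zeros_like(image)
    for k, w in enumerate(TAPS):                        # mirrors scf.for %arg2
        out += w * padded[:, :, k:k + 1920, :]          # broadcast tap, mul-add
    return out

The inner scf.for %arg2 loop in the dumps is exactly this per-tap multiply-accumulate, performed 32 output columns at a time (vector<32xi32>), with the broadcast tap produced by the vector.shuffle of the single-element filter read.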
// -----// IR Dump After LinalgStrategyLowerVectorsPass (iree-linalg-strategy-lower-vectors-pass) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6]
%8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7]
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5]
%12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5]
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7]
%14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5]
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7]
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
%subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%24 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%25 = vector.transfer_read %0[%c0, %arg2, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32>
%26 = vector.transfer_read %collapse_shape_6[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%27 = vector.shuffle %25, %25 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%28 = arith.muli %24, %27 : vector<32xi32>
%29 = arith.addi %28, %26 : vector<32xi32>
vector.transfer_write %29, %collapse_shape_6[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>>
}
}
}
return
}
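The affine.max/affine.min chain above (%5 through %10) is the clamped halo computation: each x-workgroup produces 64 output columns and needs 64 + 2*21 = 106 input columns, clipped to the 1920-column image, with %5 recording how many zero columns must be padded on the left. A quick worked check of those maps in Python (my own arithmetic; the variable names only echo the SSA values):

W, TILE, RADIUS = 1920, 64, 21

def input_window(wg_x):
    x0 = TILE * wg_x                                    # tile origin, %4
    left_pad  = max(RADIUS - x0, 0)                     # %5
    src_begin = min(W, max(0, x0 - RADIUS))             # %6, %7
    src_end   = min(W, max(0, x0 + TILE + RADIUS))      # %8, %9
    return left_pad, src_begin, src_end - src_begin     # (%5, %7, %10)

input_window(0)    # (21, 0, 85): 21 zero columns, then 85 real ones, 106 total
input_window(1)    # (0, 43, 106): interior tile, no padding needed
input_window(29)   # (0, 1835, 85): last of the 30 x-workgroups, right halo padded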
// -----// IR Dump After LinalgStrategyLowerVectorsPass (iree-linalg-strategy-lower-vectors-pass) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6]
%8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7]
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5]
%12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5]
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7]
%14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5]
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7]
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
%subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%24 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%25 = memref.load %0[%c0, %arg2, %c0] : memref<1x43x1xi32>
%26 = vector.broadcast %25 : i32 to vector<1xi32>
%27 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%28 = vector.shuffle %26, %26 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%29 = arith.muli %24, %28 : vector<32xi32>
%30 = arith.addi %29, %27 : vector<32xi32>
vector.store %30, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
}
}
}
return
}
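// The innermost scf.for above is the decomposed depthwise convolution: for each of the 43
// filter taps it stages 32 input pixels (zero-padded where the window leaves the image) in
// %alloca, splats the tap value, and accumulates multiply-add results in place into the output
// subview. Below is a minimal, hand-written MLIR sketch of that per-tap step — an illustration,
// not IREE output — which folds the dump's broadcast-to-1-lane + vector.shuffle splat into a
// single 32-lane vector.broadcast:
func.func @depthwise_tap_sketch(%in: vector<32xi32>, %acc: vector<32xi32>, %tap: i32) -> vector<32xi32> {
  %t = vector.broadcast %tap : i32 to vector<32xi32>   // splat the filter tap across 32 lanes
  %m = arith.muli %in, %t : vector<32xi32>             // input * tap
  %r = arith.addi %m, %acc : vector<32xi32>            // accumulate into the running sum
  return %r : vector<32xi32>
}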
// -----// IR Dump After CSE (cse) //----- //
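// NOTE: CSE found no redundant subexpressions; the dump below is identical to the previous one.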
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6]
%8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7]
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5]
%12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5]
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7]
%14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5]
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7]
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
%subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%24 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%25 = memref.load %0[%c0, %arg2, %c0] : memref<1x43x1xi32>
%26 = vector.broadcast %25 : i32 to vector<1xi32>
%27 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%28 = vector.shuffle %26, %26 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%29 = arith.muli %24, %28 : vector<32xi32>
%30 = arith.addi %29, %27 : vector<32xi32>
vector.store %30, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
}
}
}
return
}
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- //
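// NOTE: this pass left the IR untouched; the dump below matches the one above.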
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6]
%8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7]
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5]
%12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5]
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7]
%14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5]
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7]
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
%subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%24 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%25 = memref.load %0[%c0, %arg2, %c0] : memref<1x43x1xi32>
%26 = vector.broadcast %25 : i32 to vector<1xi32>
%27 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%28 = vector.shuffle %26, %26 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%29 = arith.muli %24, %28 : vector<32xi32>
%30 = arith.addi %29, %27 : vector<32xi32>
vector.store %30, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
}
}
}
return
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (iree-linalg-strategy-remove-markers-pass) //----- //
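// NOTE: no strategy markers remain on any op, so stripping them is a no-op here and the IR is
// unchanged.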
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6]
%8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7]
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5]
%12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5]
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7]
%14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5]
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7]
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
%subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%24 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%25 = memref.load %0[%c0, %arg2, %c0] : memref<1x43x1xi32>
%26 = vector.broadcast %25 : i32 to vector<1xi32>
%27 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%28 = vector.shuffle %26, %26 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%29 = arith.muli %24, %28 : vector<32xi32>
%30 = arith.addi %29, %27 : vector<32xi32>
vector.store %30, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
}
}
}
return
}
// -----// IR Dump After CSE (cse) //----- //
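// NOTE: another no-op round; CSE finds nothing further to fold.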
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6]
%8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7]
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5]
%12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5]
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7]
%14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5]
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7]
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
%subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%24 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%25 = memref.load %0[%c0, %arg2, %c0] : memref<1x43x1xi32>
%26 = vector.broadcast %25 : i32 to vector<1xi32>
%27 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%28 = vector.shuffle %26, %26 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%29 = arith.muli %24, %28 : vector<32xi32>
%30 = arith.addi %29, %27 : vector<32xi32>
vector.store %30, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
}
}
}
return
}
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- //
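// NOTE: identical to the previous dump; nothing left for the enable pass to do at this stage.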
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6]
%8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7]
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5]
%12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5]
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7]
%14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5]
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7]
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
%subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%24 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%25 = memref.load %0[%c0, %arg2, %c0] : memref<1x43x1xi32>
%26 = vector.broadcast %25 : i32 to vector<1xi32>
%27 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%28 = vector.shuffle %26, %26 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%29 = arith.muli %24, %28 : vector<32xi32>
%30 = arith.addi %29, %27 : vector<32xi32>
vector.store %30, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
}
}
}
return
}
// -----// IR Dump After LinalgVectorLowering (linalg-vector-lowering) //----- //
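// NOTE: the generic vector-lowering sweep makes no further changes here; the remaining cleanup
// (folding the constant-filter load) falls to the canonicalizer below.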
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = bufferization.to_memref %cst : memref<1x43x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6]
%8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7]
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5]
%12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5]
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7]
%14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5]
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7]
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
%subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%24 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%25 = memref.load %0[%c0, %arg2, %c0] : memref<1x43x1xi32>
%26 = vector.broadcast %25 : i32 to vector<1xi32>
%27 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%28 = vector.shuffle %26, %26 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%29 = arith.muli %24, %28 : vector<32xi32>
%30 = arith.addi %29, %27 : vector<32xi32>
vector.store %30, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
}
}
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
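// NOTE: canonicalization folded away the bufferization.to_memref of the constant filter: the
// per-tap memref.load is now a tensor.extract directly on %cst, and the SSA value numbers shift
// down by one as a result.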
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %0, 64 : memref<1x1080x1920x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %1[0, %2, %3, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%4 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%5 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (1920, s0)>()[%5]
%7 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%8 = affine.min affine_map<()[s0] -> (1920, s0)>()[%7]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%8, %6]
%subview_1 = memref.subview %0[0, %2, %6, 0] [1, 60, %9, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%10 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%4]
%11 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%4]
%12 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%11)[%8, %6]
%13 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%4]
%14 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%13)[%8, %6]
%15 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%14, %12)
%subview_3 = memref.subview %subview_1[0, %arg0, %12, 0] [1, 1, %15, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%10, %arg2)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %10)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %14, %12)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %10)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %14, %12)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%subview_7 = memref.subview %subview_3[0, 0, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%22 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
vector.transfer_write %22, %alloca[%c0, %c0, %16, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%23 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%extracted = tensor.extract %cst[%c0, %arg2, %c0] : tensor<1x43x1xi32>
%24 = vector.broadcast %extracted : i32 to vector<1xi32>
%25 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%26 = vector.shuffle %24, %24 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%27 = arith.muli %23, %26 : vector<32xi32>
%28 = arith.addi %27, %25 : vector<32xi32>
vector.store %28, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
}
}
}
return
}
// -----// IR Dump After CSE (cse) //----- //
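// NOTE: CSE once more leaves the IR as-is.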
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %0, 64 : memref<1x1080x1920x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %1[0, %2, %3, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%4 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%5 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (1920, s0)>()[%5]
%7 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%8 = affine.min affine_map<()[s0] -> (1920, s0)>()[%7]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%8, %6]
%subview_1 = memref.subview %0[0, %2, %6, 0] [1, 60, %9, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%10 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%4]
%11 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%4]
%12 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%11)[%8, %6]
%13 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%4]
%14 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%13)[%8, %6]
%15 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%14, %12)
%subview_3 = memref.subview %subview_1[0, %arg0, %12, 0] [1, 1, %15, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%10, %arg2)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %10)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %14, %12)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %10)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %14, %12)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%subview_7 = memref.subview %subview_3[0, 0, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%22 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
vector.transfer_write %22, %alloca[%c0, %c0, %16, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
%23 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%extracted = tensor.extract %cst[%c0, %arg2, %c0] : tensor<1x43x1xi32>
%24 = vector.broadcast %extracted : i32 to vector<1xi32>
%25 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%26 = vector.shuffle %24, %24 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%27 = arith.muli %23, %26 : vector<32xi32>
%28 = arith.addi %27, %25 : vector<32xi32>
vector.store %28, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
}
}
}
return
}
// -----// IR Dump After LinalgStrategyLowerVectorsPass (iree-linalg-strategy-lower-vectors-pass) //----- //
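// NOTE: this final lowering round unrolls the remaining 2-D vector.transfer_write of the 32x1
// zero vector into the strided output subview: each row becomes a vector.extract + memref.store
// pair, which is why the index constants %c2 through %c31 are now materialized at the top of
// the function.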
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c5 = arith.constant 5 : index
%c6 = arith.constant 6 : index
%c7 = arith.constant 7 : index
%c8 = arith.constant 8 : index
%c9 = arith.constant 9 : index
%c10 = arith.constant 10 : index
%c11 = arith.constant 11 : index
%c12 = arith.constant 12 : index
%c13 = arith.constant 13 : index
%c14 = arith.constant 14 : index
%c15 = arith.constant 15 : index
%c16 = arith.constant 16 : index
%c17 = arith.constant 17 : index
%c18 = arith.constant 18 : index
%c19 = arith.constant 19 : index
%c20 = arith.constant 20 : index
%c21 = arith.constant 21 : index
%c22 = arith.constant 22 : index
%c23 = arith.constant 23 : index
%c24 = arith.constant 24 : index
%c25 = arith.constant 25 : index
%c26 = arith.constant 26 : index
%c27 = arith.constant 27 : index
%c28 = arith.constant 28 : index
%c29 = arith.constant 29 : index
%c30 = arith.constant 30 : index
%c31 = arith.constant 31 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %0, 64 : memref<1x1080x1920x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %1[0, %2, %3, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%4 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%5 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (1920, s0)>()[%5]
%7 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%8 = affine.min affine_map<()[s0] -> (1920, s0)>()[%7]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%8, %6]
%subview_1 = memref.subview %0[0, %2, %6, 0] [1, 60, %9, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%10 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%4]
%11 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%4]
%12 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%11)[%8, %6]
%13 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%4]
%14 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%13)[%8, %6]
%15 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%14, %12)
%subview_3 = memref.subview %subview_1[0, %arg0, %12, 0] [1, 1, %15, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%16 = vector.extract %cst_0[0, 0] : vector<32x1xi32>
memref.store %16, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%17 = vector.extract %cst_0[1, 0] : vector<32x1xi32>
memref.store %17, %subview_4[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%18 = vector.extract %cst_0[2, 0] : vector<32x1xi32>
memref.store %18, %subview_4[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%19 = vector.extract %cst_0[3, 0] : vector<32x1xi32>
memref.store %19, %subview_4[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%20 = vector.extract %cst_0[4, 0] : vector<32x1xi32>
memref.store %20, %subview_4[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%21 = vector.extract %cst_0[5, 0] : vector<32x1xi32>
memref.store %21, %subview_4[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%22 = vector.extract %cst_0[6, 0] : vector<32x1xi32>
memref.store %22, %subview_4[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%23 = vector.extract %cst_0[7, 0] : vector<32x1xi32>
memref.store %23, %subview_4[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%24 = vector.extract %cst_0[8, 0] : vector<32x1xi32>
memref.store %24, %subview_4[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%25 = vector.extract %cst_0[9, 0] : vector<32x1xi32>
memref.store %25, %subview_4[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%26 = vector.extract %cst_0[10, 0] : vector<32x1xi32>
memref.store %26, %subview_4[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%27 = vector.extract %cst_0[11, 0] : vector<32x1xi32>
memref.store %27, %subview_4[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%28 = vector.extract %cst_0[12, 0] : vector<32x1xi32>
memref.store %28, %subview_4[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%29 = vector.extract %cst_0[13, 0] : vector<32x1xi32>
memref.store %29, %subview_4[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%30 = vector.extract %cst_0[14, 0] : vector<32x1xi32>
memref.store %30, %subview_4[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%31 = vector.extract %cst_0[15, 0] : vector<32x1xi32>
memref.store %31, %subview_4[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%32 = vector.extract %cst_0[16, 0] : vector<32x1xi32>
memref.store %32, %subview_4[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%33 = vector.extract %cst_0[17, 0] : vector<32x1xi32>
memref.store %33, %subview_4[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%34 = vector.extract %cst_0[18, 0] : vector<32x1xi32>
memref.store %34, %subview_4[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%35 = vector.extract %cst_0[19, 0] : vector<32x1xi32>
memref.store %35, %subview_4[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%36 = vector.extract %cst_0[20, 0] : vector<32x1xi32>
memref.store %36, %subview_4[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%37 = vector.extract %cst_0[21, 0] : vector<32x1xi32>
memref.store %37, %subview_4[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%38 = vector.extract %cst_0[22, 0] : vector<32x1xi32>
memref.store %38, %subview_4[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%39 = vector.extract %cst_0[23, 0] : vector<32x1xi32>
memref.store %39, %subview_4[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%40 = vector.extract %cst_0[24, 0] : vector<32x1xi32>
memref.store %40, %subview_4[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%41 = vector.extract %cst_0[25, 0] : vector<32x1xi32>
memref.store %41, %subview_4[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%42 = vector.extract %cst_0[26, 0] : vector<32x1xi32>
memref.store %42, %subview_4[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%43 = vector.extract %cst_0[27, 0] : vector<32x1xi32>
memref.store %43, %subview_4[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%44 = vector.extract %cst_0[28, 0] : vector<32x1xi32>
memref.store %44, %subview_4[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%45 = vector.extract %cst_0[29, 0] : vector<32x1xi32>
memref.store %45, %subview_4[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%46 = vector.extract %cst_0[30, 0] : vector<32x1xi32>
memref.store %46, %subview_4[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%47 = vector.extract %cst_0[31, 0] : vector<32x1xi32>
memref.store %47, %subview_4[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%48 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%10, %arg2)
%49 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %10)
%50 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%49, %14, %12)
%51 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %10)
%52 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%51, %14, %12)
%53 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%52, %50)
%subview_7 = memref.subview %subview_3[0, 0, %50, 0] [1, 1, %53, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%54 = vector.extract %cst_0[0, 0] : vector<32x1xi32>
memref.store %54, %alloca[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32>
%55 = vector.extract %cst_0[1, 0] : vector<32x1xi32>
memref.store %55, %alloca[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32>
%56 = vector.extract %cst_0[2, 0] : vector<32x1xi32>
memref.store %56, %alloca[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32>
%57 = vector.extract %cst_0[3, 0] : vector<32x1xi32>
memref.store %57, %alloca[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32>
%58 = vector.extract %cst_0[4, 0] : vector<32x1xi32>
memref.store %58, %alloca[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32>
%59 = vector.extract %cst_0[5, 0] : vector<32x1xi32>
memref.store %59, %alloca[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32>
%60 = vector.extract %cst_0[6, 0] : vector<32x1xi32>
memref.store %60, %alloca[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32>
%61 = vector.extract %cst_0[7, 0] : vector<32x1xi32>
memref.store %61, %alloca[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32>
%62 = vector.extract %cst_0[8, 0] : vector<32x1xi32>
memref.store %62, %alloca[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32>
%63 = vector.extract %cst_0[9, 0] : vector<32x1xi32>
memref.store %63, %alloca[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32>
%64 = vector.extract %cst_0[10, 0] : vector<32x1xi32>
memref.store %64, %alloca[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32>
%65 = vector.extract %cst_0[11, 0] : vector<32x1xi32>
memref.store %65, %alloca[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32>
%66 = vector.extract %cst_0[12, 0] : vector<32x1xi32>
memref.store %66, %alloca[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32>
%67 = vector.extract %cst_0[13, 0] : vector<32x1xi32>
memref.store %67, %alloca[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32>
%68 = vector.extract %cst_0[14, 0] : vector<32x1xi32>
memref.store %68, %alloca[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32>
%69 = vector.extract %cst_0[15, 0] : vector<32x1xi32>
memref.store %69, %alloca[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32>
%70 = vector.extract %cst_0[16, 0] : vector<32x1xi32>
memref.store %70, %alloca[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32>
%71 = vector.extract %cst_0[17, 0] : vector<32x1xi32>
memref.store %71, %alloca[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32>
%72 = vector.extract %cst_0[18, 0] : vector<32x1xi32>
memref.store %72, %alloca[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32>
%73 = vector.extract %cst_0[19, 0] : vector<32x1xi32>
memref.store %73, %alloca[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32>
%74 = vector.extract %cst_0[20, 0] : vector<32x1xi32>
memref.store %74, %alloca[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32>
%75 = vector.extract %cst_0[21, 0] : vector<32x1xi32>
memref.store %75, %alloca[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32>
%76 = vector.extract %cst_0[22, 0] : vector<32x1xi32>
memref.store %76, %alloca[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32>
%77 = vector.extract %cst_0[23, 0] : vector<32x1xi32>
memref.store %77, %alloca[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32>
%78 = vector.extract %cst_0[24, 0] : vector<32x1xi32>
memref.store %78, %alloca[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32>
%79 = vector.extract %cst_0[25, 0] : vector<32x1xi32>
memref.store %79, %alloca[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32>
%80 = vector.extract %cst_0[26, 0] : vector<32x1xi32>
memref.store %80, %alloca[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32>
%81 = vector.extract %cst_0[27, 0] : vector<32x1xi32>
memref.store %81, %alloca[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32>
%82 = vector.extract %cst_0[28, 0] : vector<32x1xi32>
memref.store %82, %alloca[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32>
%83 = vector.extract %cst_0[29, 0] : vector<32x1xi32>
memref.store %83, %alloca[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32>
%84 = vector.extract %cst_0[30, 0] : vector<32x1xi32>
memref.store %84, %alloca[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32>
%85 = vector.extract %cst_0[31, 0] : vector<32x1xi32>
memref.store %85, %alloca[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32>
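// The chain of scf.if ops below is a scalarized masked vector load: lane i
// of the running vector is overwritten with element i of %subview_7 only
// while i < %53 (the in-bounds length); out-of-bounds lanes keep the zero
// carried in from %cst_0.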
%86 = arith.cmpi sgt, %53, %c0 : index
%87 = scf.if %86 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c0, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %cst_0 [0] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %cst_0 : vector<32x1xi32>
}
%88 = arith.cmpi sgt, %53, %c1 : index
%89 = scf.if %88 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c1, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %87 [1] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %87 : vector<32x1xi32>
}
%90 = arith.cmpi sgt, %53, %c2 : index
%91 = scf.if %90 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c2, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %89 [2] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %89 : vector<32x1xi32>
}
%92 = arith.cmpi sgt, %53, %c3 : index
%93 = scf.if %92 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c3, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %91 [3] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %91 : vector<32x1xi32>
}
%94 = arith.cmpi sgt, %53, %c4 : index
%95 = scf.if %94 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c4, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %93 [4] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %93 : vector<32x1xi32>
}
%96 = arith.cmpi sgt, %53, %c5 : index
%97 = scf.if %96 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c5, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %95 [5] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %95 : vector<32x1xi32>
}
%98 = arith.cmpi sgt, %53, %c6 : index
%99 = scf.if %98 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c6, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %97 [6] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %97 : vector<32x1xi32>
}
%100 = arith.cmpi sgt, %53, %c7 : index
%101 = scf.if %100 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c7, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %99 [7] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %99 : vector<32x1xi32>
}
%102 = arith.cmpi sgt, %53, %c8 : index
%103 = scf.if %102 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c8, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %101 [8] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %101 : vector<32x1xi32>
}
%104 = arith.cmpi sgt, %53, %c9 : index
%105 = scf.if %104 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c9, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %103 [9] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %103 : vector<32x1xi32>
}
%106 = arith.cmpi sgt, %53, %c10 : index
%107 = scf.if %106 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c10, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %105 [10] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %105 : vector<32x1xi32>
}
%108 = arith.cmpi sgt, %53, %c11 : index
%109 = scf.if %108 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c11, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %107 [11] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %107 : vector<32x1xi32>
}
%110 = arith.cmpi sgt, %53, %c12 : index
%111 = scf.if %110 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c12, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %109 [12] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %109 : vector<32x1xi32>
}
%112 = arith.cmpi sgt, %53, %c13 : index
%113 = scf.if %112 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c13, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %111 [13] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %111 : vector<32x1xi32>
}
%114 = arith.cmpi sgt, %53, %c14 : index
%115 = scf.if %114 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c14, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %113 [14] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %113 : vector<32x1xi32>
}
%116 = arith.cmpi sgt, %53, %c15 : index
%117 = scf.if %116 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c15, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %115 [15] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %115 : vector<32x1xi32>
}
%118 = arith.cmpi sgt, %53, %c16 : index
%119 = scf.if %118 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c16, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %117 [16] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %117 : vector<32x1xi32>
}
%120 = arith.cmpi sgt, %53, %c17 : index
%121 = scf.if %120 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c17, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %119 [17] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %119 : vector<32x1xi32>
}
%122 = arith.cmpi sgt, %53, %c18 : index
%123 = scf.if %122 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c18, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %121 [18] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %121 : vector<32x1xi32>
}
%124 = arith.cmpi sgt, %53, %c19 : index
%125 = scf.if %124 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c19, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %123 [19] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %123 : vector<32x1xi32>
}
%126 = arith.cmpi sgt, %53, %c20 : index
%127 = scf.if %126 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c20, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %125 [20] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %125 : vector<32x1xi32>
}
%128 = arith.cmpi sgt, %53, %c21 : index
%129 = scf.if %128 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c21, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %127 [21] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %127 : vector<32x1xi32>
}
%130 = arith.cmpi sgt, %53, %c22 : index
%131 = scf.if %130 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c22, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %129 [22] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %129 : vector<32x1xi32>
}
%132 = arith.cmpi sgt, %53, %c23 : index
%133 = scf.if %132 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c23, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %131 [23] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %131 : vector<32x1xi32>
}
%134 = arith.cmpi sgt, %53, %c24 : index
%135 = scf.if %134 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c24, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %133 [24] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %133 : vector<32x1xi32>
}
%136 = arith.cmpi sgt, %53, %c25 : index
%137 = scf.if %136 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c25, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %135 [25] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %135 : vector<32x1xi32>
}
%138 = arith.cmpi sgt, %53, %c26 : index
%139 = scf.if %138 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c26, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %137 [26] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %137 : vector<32x1xi32>
}
%140 = arith.cmpi sgt, %53, %c27 : index
%141 = scf.if %140 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c27, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %139 [27] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %139 : vector<32x1xi32>
}
%142 = arith.cmpi sgt, %53, %c28 : index
%143 = scf.if %142 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c28, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %141 [28] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %141 : vector<32x1xi32>
}
%144 = arith.cmpi sgt, %53, %c29 : index
%145 = scf.if %144 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c29, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %143 [29] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %143 : vector<32x1xi32>
}
%146 = arith.cmpi sgt, %53, %c30 : index
%147 = scf.if %146 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c30, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %145 [30] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %145 : vector<32x1xi32>
}
%148 = arith.cmpi sgt, %53, %c31 : index
%149 = scf.if %148 -> (vector<32x1xi32>) {
%219 = memref.load %subview_7[%c0, %c0, %c31, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%220 = vector.broadcast %219 : i32 to vector<1xi32>
%221 = vector.insert %220, %147 [31] : vector<1xi32> into vector<32x1xi32>
scf.yield %221 : vector<32x1xi32>
} else {
scf.yield %147 : vector<32x1xi32>
}
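// The guarded stores below scatter the loaded vector %149 into the staging
// buffer: lane i goes to %alloca[0, 0, %48 + i, 0] whenever %48 + i < 32.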
%150 = arith.cmpi slt, %48, %c32 : index
scf.if %150 {
%219 = vector.extract %149[0, 0] : vector<32x1xi32>
memref.store %219, %alloca[%c0, %c0, %48, %c0] : memref<1x1x32x1xi32>
} else {
}
%151 = affine.apply affine_map<(d0) -> (d0 + 1)>(%48)
%152 = arith.cmpi slt, %151, %c32 : index
scf.if %152 {
%219 = affine.apply affine_map<(d0) -> (d0 + 1)>(%48)
%220 = vector.extract %149[1, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%153 = affine.apply affine_map<(d0) -> (d0 + 2)>(%48)
%154 = arith.cmpi slt, %153, %c32 : index
scf.if %154 {
%219 = affine.apply affine_map<(d0) -> (d0 + 2)>(%48)
%220 = vector.extract %149[2, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%155 = affine.apply affine_map<(d0) -> (d0 + 3)>(%48)
%156 = arith.cmpi slt, %155, %c32 : index
scf.if %156 {
%219 = affine.apply affine_map<(d0) -> (d0 + 3)>(%48)
%220 = vector.extract %149[3, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%157 = affine.apply affine_map<(d0) -> (d0 + 4)>(%48)
%158 = arith.cmpi slt, %157, %c32 : index
scf.if %158 {
%219 = affine.apply affine_map<(d0) -> (d0 + 4)>(%48)
%220 = vector.extract %149[4, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%159 = affine.apply affine_map<(d0) -> (d0 + 5)>(%48)
%160 = arith.cmpi slt, %159, %c32 : index
scf.if %160 {
%219 = affine.apply affine_map<(d0) -> (d0 + 5)>(%48)
%220 = vector.extract %149[5, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%161 = affine.apply affine_map<(d0) -> (d0 + 6)>(%48)
%162 = arith.cmpi slt, %161, %c32 : index
scf.if %162 {
%219 = affine.apply affine_map<(d0) -> (d0 + 6)>(%48)
%220 = vector.extract %149[6, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%163 = affine.apply affine_map<(d0) -> (d0 + 7)>(%48)
%164 = arith.cmpi slt, %163, %c32 : index
scf.if %164 {
%219 = affine.apply affine_map<(d0) -> (d0 + 7)>(%48)
%220 = vector.extract %149[7, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%165 = affine.apply affine_map<(d0) -> (d0 + 8)>(%48)
%166 = arith.cmpi slt, %165, %c32 : index
scf.if %166 {
%219 = affine.apply affine_map<(d0) -> (d0 + 8)>(%48)
%220 = vector.extract %149[8, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%167 = affine.apply affine_map<(d0) -> (d0 + 9)>(%48)
%168 = arith.cmpi slt, %167, %c32 : index
scf.if %168 {
%219 = affine.apply affine_map<(d0) -> (d0 + 9)>(%48)
%220 = vector.extract %149[9, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%169 = affine.apply affine_map<(d0) -> (d0 + 10)>(%48)
%170 = arith.cmpi slt, %169, %c32 : index
scf.if %170 {
%219 = affine.apply affine_map<(d0) -> (d0 + 10)>(%48)
%220 = vector.extract %149[10, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%171 = affine.apply affine_map<(d0) -> (d0 + 11)>(%48)
%172 = arith.cmpi slt, %171, %c32 : index
scf.if %172 {
%219 = affine.apply affine_map<(d0) -> (d0 + 11)>(%48)
%220 = vector.extract %149[11, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%173 = affine.apply affine_map<(d0) -> (d0 + 12)>(%48)
%174 = arith.cmpi slt, %173, %c32 : index
scf.if %174 {
%219 = affine.apply affine_map<(d0) -> (d0 + 12)>(%48)
%220 = vector.extract %149[12, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%175 = affine.apply affine_map<(d0) -> (d0 + 13)>(%48)
%176 = arith.cmpi slt, %175, %c32 : index
scf.if %176 {
%219 = affine.apply affine_map<(d0) -> (d0 + 13)>(%48)
%220 = vector.extract %149[13, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%177 = affine.apply affine_map<(d0) -> (d0 + 14)>(%48)
%178 = arith.cmpi slt, %177, %c32 : index
scf.if %178 {
%219 = affine.apply affine_map<(d0) -> (d0 + 14)>(%48)
%220 = vector.extract %149[14, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%179 = affine.apply affine_map<(d0) -> (d0 + 15)>(%48)
%180 = arith.cmpi slt, %179, %c32 : index
scf.if %180 {
%219 = affine.apply affine_map<(d0) -> (d0 + 15)>(%48)
%220 = vector.extract %149[15, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%181 = affine.apply affine_map<(d0) -> (d0 + 16)>(%48)
%182 = arith.cmpi slt, %181, %c32 : index
scf.if %182 {
%219 = affine.apply affine_map<(d0) -> (d0 + 16)>(%48)
%220 = vector.extract %149[16, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%183 = affine.apply affine_map<(d0) -> (d0 + 17)>(%48)
%184 = arith.cmpi slt, %183, %c32 : index
scf.if %184 {
%219 = affine.apply affine_map<(d0) -> (d0 + 17)>(%48)
%220 = vector.extract %149[17, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%185 = affine.apply affine_map<(d0) -> (d0 + 18)>(%48)
%186 = arith.cmpi slt, %185, %c32 : index
scf.if %186 {
%219 = affine.apply affine_map<(d0) -> (d0 + 18)>(%48)
%220 = vector.extract %149[18, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%187 = affine.apply affine_map<(d0) -> (d0 + 19)>(%48)
%188 = arith.cmpi slt, %187, %c32 : index
scf.if %188 {
%219 = affine.apply affine_map<(d0) -> (d0 + 19)>(%48)
%220 = vector.extract %149[19, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%189 = affine.apply affine_map<(d0) -> (d0 + 20)>(%48)
%190 = arith.cmpi slt, %189, %c32 : index
scf.if %190 {
%219 = affine.apply affine_map<(d0) -> (d0 + 20)>(%48)
%220 = vector.extract %149[20, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%191 = affine.apply affine_map<(d0) -> (d0 + 21)>(%48)
%192 = arith.cmpi slt, %191, %c32 : index
scf.if %192 {
%219 = affine.apply affine_map<(d0) -> (d0 + 21)>(%48)
%220 = vector.extract %149[21, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%193 = affine.apply affine_map<(d0) -> (d0 + 22)>(%48)
%194 = arith.cmpi slt, %193, %c32 : index
scf.if %194 {
%219 = affine.apply affine_map<(d0) -> (d0 + 22)>(%48)
%220 = vector.extract %149[22, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%195 = affine.apply affine_map<(d0) -> (d0 + 23)>(%48)
%196 = arith.cmpi slt, %195, %c32 : index
scf.if %196 {
%219 = affine.apply affine_map<(d0) -> (d0 + 23)>(%48)
%220 = vector.extract %149[23, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%197 = affine.apply affine_map<(d0) -> (d0 + 24)>(%48)
%198 = arith.cmpi slt, %197, %c32 : index
scf.if %198 {
%219 = affine.apply affine_map<(d0) -> (d0 + 24)>(%48)
%220 = vector.extract %149[24, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%199 = affine.apply affine_map<(d0) -> (d0 + 25)>(%48)
%200 = arith.cmpi slt, %199, %c32 : index
scf.if %200 {
%219 = affine.apply affine_map<(d0) -> (d0 + 25)>(%48)
%220 = vector.extract %149[25, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%201 = affine.apply affine_map<(d0) -> (d0 + 26)>(%48)
%202 = arith.cmpi slt, %201, %c32 : index
scf.if %202 {
%219 = affine.apply affine_map<(d0) -> (d0 + 26)>(%48)
%220 = vector.extract %149[26, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%203 = affine.apply affine_map<(d0) -> (d0 + 27)>(%48)
%204 = arith.cmpi slt, %203, %c32 : index
scf.if %204 {
%219 = affine.apply affine_map<(d0) -> (d0 + 27)>(%48)
%220 = vector.extract %149[27, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%205 = affine.apply affine_map<(d0) -> (d0 + 28)>(%48)
%206 = arith.cmpi slt, %205, %c32 : index
scf.if %206 {
%219 = affine.apply affine_map<(d0) -> (d0 + 28)>(%48)
%220 = vector.extract %149[28, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%207 = affine.apply affine_map<(d0) -> (d0 + 29)>(%48)
%208 = arith.cmpi slt, %207, %c32 : index
scf.if %208 {
%219 = affine.apply affine_map<(d0) -> (d0 + 29)>(%48)
%220 = vector.extract %149[29, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%209 = affine.apply affine_map<(d0) -> (d0 + 30)>(%48)
%210 = arith.cmpi slt, %209, %c32 : index
scf.if %210 {
%219 = affine.apply affine_map<(d0) -> (d0 + 30)>(%48)
%220 = vector.extract %149[30, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
%211 = affine.apply affine_map<(d0) -> (d0 + 31)>(%48)
%212 = arith.cmpi slt, %211, %c32 : index
scf.if %212 {
%219 = affine.apply affine_map<(d0) -> (d0 + 31)>(%48)
%220 = vector.extract %149[31, 0] : vector<32x1xi32>
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32>
} else {
}
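// With the padded window staged in %alloca, the depthwise-conv arithmetic
// itself is a single 32-lane multiply-accumulate: load the staged input
// (%213), broadcast filter tap %arg2 (%214/%216), multiply (%217), add into
// the output accumulator loaded in %215, and store the result back (%218).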
%213 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%extracted = tensor.extract %cst[%c0, %arg2, %c0] : tensor<1x43x1xi32>
%214 = vector.broadcast %extracted : i32 to vector<1xi32>
%215 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%216 = vector.shuffle %214, %214 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%217 = arith.muli %213, %216 : vector<32xi32>
%218 = arith.addi %217, %215 : vector<32xi32>
vector.store %218, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
}
}
}
return
}
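// A minimal standalone sketch (not part of the generated dump) of what one
// inner-loop iteration computes once the boundary handling is stripped away:
// out[0:32] += in[0:32] * splat(w). The function and value names (@fma_core,
// %in, %out, %w) are illustrative assumptions, not IREE output; the body
// parses and runs under plain mlir-opt.
func.func @fma_core(%in: memref<1x32xi32>, %out: memref<1x32xi32>, %w: i32) {
  %c0 = arith.constant 0 : index
  %lhs = vector.load %in[%c0, %c0] : memref<1x32xi32>, vector<32xi32>
  %acc = vector.load %out[%c0, %c0] : memref<1x32xi32>, vector<32xi32>
  // Broadcast one filter tap across all 32 lanes; the dump above does this
  // with a vector.shuffle of a vector<1xi32> instead.
  %wv = vector.broadcast %w : i32 to vector<32xi32>
  %mul = arith.muli %lhs, %wv : vector<32xi32>
  %acc2 = arith.addi %mul, %acc : vector<32xi32>
  vector.store %acc2, %out[%c0, %c0] : memref<1x32xi32>, vector<32xi32>
  return
}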
// -----// IR Dump After CSE (cse) //----- //
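// Relative to the previous dump, CSE has (a) hoisted the 32 identical
// vector.extract %cst_0[i, 0] values (all zero, %10..%41 below) out of the
// loop nest so each is computed once, and (b) removed the duplicate
// affine.apply inside each guarded store, reusing the value already computed
// for the guard (e.g. %119). The control-flow structure is unchanged.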
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c5 = arith.constant 5 : index
%c6 = arith.constant 6 : index
%c7 = arith.constant 7 : index
%c8 = arith.constant 8 : index
%c9 = arith.constant 9 : index
%c10 = arith.constant 10 : index
%c11 = arith.constant 11 : index
%c12 = arith.constant 12 : index
%c13 = arith.constant 13 : index
%c14 = arith.constant 14 : index
%c15 = arith.constant 15 : index
%c16 = arith.constant 16 : index
%c17 = arith.constant 17 : index
%c18 = arith.constant 18 : index
%c19 = arith.constant 19 : index
%c20 = arith.constant 20 : index
%c21 = arith.constant 21 : index
%c22 = arith.constant 22 : index
%c23 = arith.constant 23 : index
%c24 = arith.constant 24 : index
%c25 = arith.constant 25 : index
%c26 = arith.constant 26 : index
%c27 = arith.constant 27 : index
%c28 = arith.constant 28 : index
%c29 = arith.constant 29 : index
%c30 = arith.constant 30 : index
%c31 = arith.constant 31 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %0, 64 : memref<1x1080x1920x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %1[0, %2, %3, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%4 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%5 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (1920, s0)>()[%5]
%7 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%8 = affine.min affine_map<()[s0] -> (1920, s0)>()[%7]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%8, %6]
%subview_1 = memref.subview %0[0, %2, %6, 0] [1, 60, %9, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
%10 = vector.extract %cst_0[0, 0] : vector<32x1xi32>
%11 = vector.extract %cst_0[1, 0] : vector<32x1xi32>
%12 = vector.extract %cst_0[2, 0] : vector<32x1xi32>
%13 = vector.extract %cst_0[3, 0] : vector<32x1xi32>
%14 = vector.extract %cst_0[4, 0] : vector<32x1xi32>
%15 = vector.extract %cst_0[5, 0] : vector<32x1xi32>
%16 = vector.extract %cst_0[6, 0] : vector<32x1xi32>
%17 = vector.extract %cst_0[7, 0] : vector<32x1xi32>
%18 = vector.extract %cst_0[8, 0] : vector<32x1xi32>
%19 = vector.extract %cst_0[9, 0] : vector<32x1xi32>
%20 = vector.extract %cst_0[10, 0] : vector<32x1xi32>
%21 = vector.extract %cst_0[11, 0] : vector<32x1xi32>
%22 = vector.extract %cst_0[12, 0] : vector<32x1xi32>
%23 = vector.extract %cst_0[13, 0] : vector<32x1xi32>
%24 = vector.extract %cst_0[14, 0] : vector<32x1xi32>
%25 = vector.extract %cst_0[15, 0] : vector<32x1xi32>
%26 = vector.extract %cst_0[16, 0] : vector<32x1xi32>
%27 = vector.extract %cst_0[17, 0] : vector<32x1xi32>
%28 = vector.extract %cst_0[18, 0] : vector<32x1xi32>
%29 = vector.extract %cst_0[19, 0] : vector<32x1xi32>
%30 = vector.extract %cst_0[20, 0] : vector<32x1xi32>
%31 = vector.extract %cst_0[21, 0] : vector<32x1xi32>
%32 = vector.extract %cst_0[22, 0] : vector<32x1xi32>
%33 = vector.extract %cst_0[23, 0] : vector<32x1xi32>
%34 = vector.extract %cst_0[24, 0] : vector<32x1xi32>
%35 = vector.extract %cst_0[25, 0] : vector<32x1xi32>
%36 = vector.extract %cst_0[26, 0] : vector<32x1xi32>
%37 = vector.extract %cst_0[27, 0] : vector<32x1xi32>
%38 = vector.extract %cst_0[28, 0] : vector<32x1xi32>
%39 = vector.extract %cst_0[29, 0] : vector<32x1xi32>
%40 = vector.extract %cst_0[30, 0] : vector<32x1xi32>
%41 = vector.extract %cst_0[31, 0] : vector<32x1xi32>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%42 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%4]
%43 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%4]
%44 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%43)[%8, %6]
%45 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%4]
%46 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%45)[%8, %6]
%47 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%46, %44)
%subview_3 = memref.subview %subview_1[0, %arg0, %44, 0] [1, 1, %47, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %10, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %11, %subview_4[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %12, %subview_4[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %13, %subview_4[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %14, %subview_4[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %15, %subview_4[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %16, %subview_4[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %17, %subview_4[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %18, %subview_4[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %19, %subview_4[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %20, %subview_4[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %21, %subview_4[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %22, %subview_4[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %23, %subview_4[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %24, %subview_4[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %25, %subview_4[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %26, %subview_4[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %27, %subview_4[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %28, %subview_4[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %29, %subview_4[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %30, %subview_4[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %31, %subview_4[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %32, %subview_4[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %33, %subview_4[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %34, %subview_4[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %35, %subview_4[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %36, %subview_4[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %37, %subview_4[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %38, %subview_4[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %39, %subview_4[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %40, %subview_4[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %41, %subview_4[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%48 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%42, %arg2)
%49 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %42)
%50 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%49, %46, %44)
%51 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %42)
%52 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%51, %46, %44)
%53 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%52, %50)
%subview_7 = memref.subview %subview_3[0, 0, %50, 0] [1, 1, %53, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %10, %alloca[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32>
memref.store %11, %alloca[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32>
memref.store %12, %alloca[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32>
memref.store %13, %alloca[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32>
memref.store %14, %alloca[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32>
memref.store %15, %alloca[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32>
memref.store %16, %alloca[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32>
memref.store %17, %alloca[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32>
memref.store %18, %alloca[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32>
memref.store %19, %alloca[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32>
memref.store %20, %alloca[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32>
memref.store %21, %alloca[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32>
memref.store %22, %alloca[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32>
memref.store %23, %alloca[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32>
memref.store %24, %alloca[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32>
memref.store %25, %alloca[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32>
memref.store %26, %alloca[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32>
memref.store %27, %alloca[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32>
memref.store %28, %alloca[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32>
memref.store %29, %alloca[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32>
memref.store %30, %alloca[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32>
memref.store %31, %alloca[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32>
memref.store %32, %alloca[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32>
memref.store %33, %alloca[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32>
memref.store %34, %alloca[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32>
memref.store %35, %alloca[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32>
memref.store %36, %alloca[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32>
memref.store %37, %alloca[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32>
memref.store %38, %alloca[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32>
memref.store %39, %alloca[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32>
memref.store %40, %alloca[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32>
memref.store %41, %alloca[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32>
%54 = arith.cmpi sgt, %53, %c0 : index
%55 = scf.if %54 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c0, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %cst_0 [0] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %cst_0 : vector<32x1xi32>
}
%56 = arith.cmpi sgt, %53, %c1 : index
%57 = scf.if %56 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c1, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %55 [1] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %55 : vector<32x1xi32>
}
%58 = arith.cmpi sgt, %53, %c2 : index
%59 = scf.if %58 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c2, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %57 [2] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %57 : vector<32x1xi32>
}
%60 = arith.cmpi sgt, %53, %c3 : index
%61 = scf.if %60 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c3, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %59 [3] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %59 : vector<32x1xi32>
}
%62 = arith.cmpi sgt, %53, %c4 : index
%63 = scf.if %62 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c4, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %61 [4] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %61 : vector<32x1xi32>
}
%64 = arith.cmpi sgt, %53, %c5 : index
%65 = scf.if %64 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c5, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %63 [5] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %63 : vector<32x1xi32>
}
%66 = arith.cmpi sgt, %53, %c6 : index
%67 = scf.if %66 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c6, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %65 [6] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %65 : vector<32x1xi32>
}
%68 = arith.cmpi sgt, %53, %c7 : index
%69 = scf.if %68 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c7, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %67 [7] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %67 : vector<32x1xi32>
}
%70 = arith.cmpi sgt, %53, %c8 : index
%71 = scf.if %70 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c8, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %69 [8] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %69 : vector<32x1xi32>
}
%72 = arith.cmpi sgt, %53, %c9 : index
%73 = scf.if %72 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c9, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %71 [9] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %71 : vector<32x1xi32>
}
%74 = arith.cmpi sgt, %53, %c10 : index
%75 = scf.if %74 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c10, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %73 [10] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %73 : vector<32x1xi32>
}
%76 = arith.cmpi sgt, %53, %c11 : index
%77 = scf.if %76 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c11, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %75 [11] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %75 : vector<32x1xi32>
}
%78 = arith.cmpi sgt, %53, %c12 : index
%79 = scf.if %78 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c12, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %77 [12] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %77 : vector<32x1xi32>
}
%80 = arith.cmpi sgt, %53, %c13 : index
%81 = scf.if %80 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c13, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %79 [13] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %79 : vector<32x1xi32>
}
%82 = arith.cmpi sgt, %53, %c14 : index
%83 = scf.if %82 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c14, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %81 [14] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %81 : vector<32x1xi32>
}
%84 = arith.cmpi sgt, %53, %c15 : index
%85 = scf.if %84 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c15, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %83 [15] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %83 : vector<32x1xi32>
}
%86 = arith.cmpi sgt, %53, %c16 : index
%87 = scf.if %86 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c16, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %85 [16] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %85 : vector<32x1xi32>
}
%88 = arith.cmpi sgt, %53, %c17 : index
%89 = scf.if %88 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c17, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %87 [17] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %87 : vector<32x1xi32>
}
%90 = arith.cmpi sgt, %53, %c18 : index
%91 = scf.if %90 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c18, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %89 [18] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %89 : vector<32x1xi32>
}
%92 = arith.cmpi sgt, %53, %c19 : index
%93 = scf.if %92 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c19, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %91 [19] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %91 : vector<32x1xi32>
}
%94 = arith.cmpi sgt, %53, %c20 : index
%95 = scf.if %94 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c20, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %93 [20] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %93 : vector<32x1xi32>
}
%96 = arith.cmpi sgt, %53, %c21 : index
%97 = scf.if %96 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c21, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %95 [21] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %95 : vector<32x1xi32>
}
%98 = arith.cmpi sgt, %53, %c22 : index
%99 = scf.if %98 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c22, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %97 [22] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %97 : vector<32x1xi32>
}
%100 = arith.cmpi sgt, %53, %c23 : index
%101 = scf.if %100 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c23, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %99 [23] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %99 : vector<32x1xi32>
}
%102 = arith.cmpi sgt, %53, %c24 : index
%103 = scf.if %102 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c24, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %101 [24] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %101 : vector<32x1xi32>
}
%104 = arith.cmpi sgt, %53, %c25 : index
%105 = scf.if %104 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c25, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %103 [25] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %103 : vector<32x1xi32>
}
%106 = arith.cmpi sgt, %53, %c26 : index
%107 = scf.if %106 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c26, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %105 [26] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %105 : vector<32x1xi32>
}
%108 = arith.cmpi sgt, %53, %c27 : index
%109 = scf.if %108 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c27, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %107 [27] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %107 : vector<32x1xi32>
}
%110 = arith.cmpi sgt, %53, %c28 : index
%111 = scf.if %110 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c28, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %109 [28] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %109 : vector<32x1xi32>
}
%112 = arith.cmpi sgt, %53, %c29 : index
%113 = scf.if %112 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c29, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %111 [29] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %111 : vector<32x1xi32>
}
%114 = arith.cmpi sgt, %53, %c30 : index
%115 = scf.if %114 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c30, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %113 [30] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %113 : vector<32x1xi32>
}
%116 = arith.cmpi sgt, %53, %c31 : index
%117 = scf.if %116 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c31, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %115 [31] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %115 : vector<32x1xi32>
}
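// Scatter the gathered window into the pad buffer: lane i is stored to %alloca[%48 + i]
// only when %48 + i < 32, completing the zero-padded 32-element input row for this tap.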
%118 = arith.cmpi slt, %48, %c32 : index
scf.if %118 {
%187 = vector.extract %117[0, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %48, %c0] : memref<1x1x32x1xi32>
} else {
}
%119 = affine.apply affine_map<(d0) -> (d0 + 1)>(%48)
%120 = arith.cmpi slt, %119, %c32 : index
scf.if %120 {
%187 = vector.extract %117[1, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %119, %c0] : memref<1x1x32x1xi32>
} else {
}
%121 = affine.apply affine_map<(d0) -> (d0 + 2)>(%48)
%122 = arith.cmpi slt, %121, %c32 : index
scf.if %122 {
%187 = vector.extract %117[2, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %121, %c0] : memref<1x1x32x1xi32>
} else {
}
%123 = affine.apply affine_map<(d0) -> (d0 + 3)>(%48)
%124 = arith.cmpi slt, %123, %c32 : index
scf.if %124 {
%187 = vector.extract %117[3, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %123, %c0] : memref<1x1x32x1xi32>
} else {
}
%125 = affine.apply affine_map<(d0) -> (d0 + 4)>(%48)
%126 = arith.cmpi slt, %125, %c32 : index
scf.if %126 {
%187 = vector.extract %117[4, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %125, %c0] : memref<1x1x32x1xi32>
} else {
}
%127 = affine.apply affine_map<(d0) -> (d0 + 5)>(%48)
%128 = arith.cmpi slt, %127, %c32 : index
scf.if %128 {
%187 = vector.extract %117[5, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %127, %c0] : memref<1x1x32x1xi32>
} else {
}
%129 = affine.apply affine_map<(d0) -> (d0 + 6)>(%48)
%130 = arith.cmpi slt, %129, %c32 : index
scf.if %130 {
%187 = vector.extract %117[6, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %129, %c0] : memref<1x1x32x1xi32>
} else {
}
%131 = affine.apply affine_map<(d0) -> (d0 + 7)>(%48)
%132 = arith.cmpi slt, %131, %c32 : index
scf.if %132 {
%187 = vector.extract %117[7, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %131, %c0] : memref<1x1x32x1xi32>
} else {
}
%133 = affine.apply affine_map<(d0) -> (d0 + 8)>(%48)
%134 = arith.cmpi slt, %133, %c32 : index
scf.if %134 {
%187 = vector.extract %117[8, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %133, %c0] : memref<1x1x32x1xi32>
} else {
}
%135 = affine.apply affine_map<(d0) -> (d0 + 9)>(%48)
%136 = arith.cmpi slt, %135, %c32 : index
scf.if %136 {
%187 = vector.extract %117[9, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %135, %c0] : memref<1x1x32x1xi32>
} else {
}
%137 = affine.apply affine_map<(d0) -> (d0 + 10)>(%48)
%138 = arith.cmpi slt, %137, %c32 : index
scf.if %138 {
%187 = vector.extract %117[10, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %137, %c0] : memref<1x1x32x1xi32>
} else {
}
%139 = affine.apply affine_map<(d0) -> (d0 + 11)>(%48)
%140 = arith.cmpi slt, %139, %c32 : index
scf.if %140 {
%187 = vector.extract %117[11, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %139, %c0] : memref<1x1x32x1xi32>
} else {
}
%141 = affine.apply affine_map<(d0) -> (d0 + 12)>(%48)
%142 = arith.cmpi slt, %141, %c32 : index
scf.if %142 {
%187 = vector.extract %117[12, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %141, %c0] : memref<1x1x32x1xi32>
} else {
}
%143 = affine.apply affine_map<(d0) -> (d0 + 13)>(%48)
%144 = arith.cmpi slt, %143, %c32 : index
scf.if %144 {
%187 = vector.extract %117[13, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %143, %c0] : memref<1x1x32x1xi32>
} else {
}
%145 = affine.apply affine_map<(d0) -> (d0 + 14)>(%48)
%146 = arith.cmpi slt, %145, %c32 : index
scf.if %146 {
%187 = vector.extract %117[14, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %145, %c0] : memref<1x1x32x1xi32>
} else {
}
%147 = affine.apply affine_map<(d0) -> (d0 + 15)>(%48)
%148 = arith.cmpi slt, %147, %c32 : index
scf.if %148 {
%187 = vector.extract %117[15, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %147, %c0] : memref<1x1x32x1xi32>
} else {
}
%149 = affine.apply affine_map<(d0) -> (d0 + 16)>(%48)
%150 = arith.cmpi slt, %149, %c32 : index
scf.if %150 {
%187 = vector.extract %117[16, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %149, %c0] : memref<1x1x32x1xi32>
} else {
}
%151 = affine.apply affine_map<(d0) -> (d0 + 17)>(%48)
%152 = arith.cmpi slt, %151, %c32 : index
scf.if %152 {
%187 = vector.extract %117[17, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %151, %c0] : memref<1x1x32x1xi32>
} else {
}
%153 = affine.apply affine_map<(d0) -> (d0 + 18)>(%48)
%154 = arith.cmpi slt, %153, %c32 : index
scf.if %154 {
%187 = vector.extract %117[18, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %153, %c0] : memref<1x1x32x1xi32>
} else {
}
%155 = affine.apply affine_map<(d0) -> (d0 + 19)>(%48)
%156 = arith.cmpi slt, %155, %c32 : index
scf.if %156 {
%187 = vector.extract %117[19, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %155, %c0] : memref<1x1x32x1xi32>
} else {
}
%157 = affine.apply affine_map<(d0) -> (d0 + 20)>(%48)
%158 = arith.cmpi slt, %157, %c32 : index
scf.if %158 {
%187 = vector.extract %117[20, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %157, %c0] : memref<1x1x32x1xi32>
} else {
}
%159 = affine.apply affine_map<(d0) -> (d0 + 21)>(%48)
%160 = arith.cmpi slt, %159, %c32 : index
scf.if %160 {
%187 = vector.extract %117[21, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %159, %c0] : memref<1x1x32x1xi32>
} else {
}
%161 = affine.apply affine_map<(d0) -> (d0 + 22)>(%48)
%162 = arith.cmpi slt, %161, %c32 : index
scf.if %162 {
%187 = vector.extract %117[22, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %161, %c0] : memref<1x1x32x1xi32>
} else {
}
%163 = affine.apply affine_map<(d0) -> (d0 + 23)>(%48)
%164 = arith.cmpi slt, %163, %c32 : index
scf.if %164 {
%187 = vector.extract %117[23, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %163, %c0] : memref<1x1x32x1xi32>
} else {
}
%165 = affine.apply affine_map<(d0) -> (d0 + 24)>(%48)
%166 = arith.cmpi slt, %165, %c32 : index
scf.if %166 {
%187 = vector.extract %117[24, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %165, %c0] : memref<1x1x32x1xi32>
} else {
}
%167 = affine.apply affine_map<(d0) -> (d0 + 25)>(%48)
%168 = arith.cmpi slt, %167, %c32 : index
scf.if %168 {
%187 = vector.extract %117[25, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %167, %c0] : memref<1x1x32x1xi32>
} else {
}
%169 = affine.apply affine_map<(d0) -> (d0 + 26)>(%48)
%170 = arith.cmpi slt, %169, %c32 : index
scf.if %170 {
%187 = vector.extract %117[26, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %169, %c0] : memref<1x1x32x1xi32>
} else {
}
%171 = affine.apply affine_map<(d0) -> (d0 + 27)>(%48)
%172 = arith.cmpi slt, %171, %c32 : index
scf.if %172 {
%187 = vector.extract %117[27, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %171, %c0] : memref<1x1x32x1xi32>
} else {
}
%173 = affine.apply affine_map<(d0) -> (d0 + 28)>(%48)
%174 = arith.cmpi slt, %173, %c32 : index
scf.if %174 {
%187 = vector.extract %117[28, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %173, %c0] : memref<1x1x32x1xi32>
} else {
}
%175 = affine.apply affine_map<(d0) -> (d0 + 29)>(%48)
%176 = arith.cmpi slt, %175, %c32 : index
scf.if %176 {
%187 = vector.extract %117[29, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %175, %c0] : memref<1x1x32x1xi32>
} else {
}
%177 = affine.apply affine_map<(d0) -> (d0 + 30)>(%48)
%178 = arith.cmpi slt, %177, %c32 : index
scf.if %178 {
%187 = vector.extract %117[30, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %177, %c0] : memref<1x1x32x1xi32>
} else {
}
%179 = affine.apply affine_map<(d0) -> (d0 + 31)>(%48)
%180 = arith.cmpi slt, %179, %c32 : index
scf.if %180 {
%187 = vector.extract %117[31, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %179, %c0] : memref<1x1x32x1xi32>
} else {
}
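// Per-tap multiply-accumulate: load the 32 padded input lanes, splat the filter tap
// %cst[0, %arg2, 0] across 32 lanes, multiply, and add into the output row in place.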
%181 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%extracted = tensor.extract %cst[%c0, %arg2, %c0] : tensor<1x43x1xi32>
%182 = vector.broadcast %extracted : i32 to vector<1xi32>
%183 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%184 = vector.shuffle %182, %182 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%185 = arith.muli %181, %184 : vector<32xi32>
%186 = arith.addi %185, %183 : vector<32xi32>
vector.store %186, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
}
}
}
return
}
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- //
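// This dump reprints the function unchanged; its tail matches the preceding dump line for line.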
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c5 = arith.constant 5 : index
%c6 = arith.constant 6 : index
%c7 = arith.constant 7 : index
%c8 = arith.constant 8 : index
%c9 = arith.constant 9 : index
%c10 = arith.constant 10 : index
%c11 = arith.constant 11 : index
%c12 = arith.constant 12 : index
%c13 = arith.constant 13 : index
%c14 = arith.constant 14 : index
%c15 = arith.constant 15 : index
%c16 = arith.constant 16 : index
%c17 = arith.constant 17 : index
%c18 = arith.constant 18 : index
%c19 = arith.constant 19 : index
%c20 = arith.constant 20 : index
%c21 = arith.constant 21 : index
%c22 = arith.constant 22 : index
%c23 = arith.constant 23 : index
%c24 = arith.constant 24 : index
%c25 = arith.constant 25 : index
%c26 = arith.constant 26 : index
%c27 = arith.constant 27 : index
%c28 = arith.constant 28 : index
%c29 = arith.constant 29 : index
%c30 = arith.constant 30 : index
%c31 = arith.constant 31 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
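// 64-byte-aligned stack buffer holding one zero-padded 1x1x32x1 input window, reused for every filter tap.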
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %0, 64 : memref<1x1080x1920x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %1[0, %2, %3, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%4 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%5 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (1920, s0)>()[%5]
%7 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%8 = affine.min affine_map<()[s0] -> (1920, s0)>()[%7]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%8, %6]
%subview_1 = memref.subview %0[0, %2, %6, 0] [1, 60, %9, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
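// Collapse the pad buffer to a contiguous 1x32 view so it can be read with a single 32-wide vector.load.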
%10 = vector.extract %cst_0[0, 0] : vector<32x1xi32>
%11 = vector.extract %cst_0[1, 0] : vector<32x1xi32>
%12 = vector.extract %cst_0[2, 0] : vector<32x1xi32>
%13 = vector.extract %cst_0[3, 0] : vector<32x1xi32>
%14 = vector.extract %cst_0[4, 0] : vector<32x1xi32>
%15 = vector.extract %cst_0[5, 0] : vector<32x1xi32>
%16 = vector.extract %cst_0[6, 0] : vector<32x1xi32>
%17 = vector.extract %cst_0[7, 0] : vector<32x1xi32>
%18 = vector.extract %cst_0[8, 0] : vector<32x1xi32>
%19 = vector.extract %cst_0[9, 0] : vector<32x1xi32>
%20 = vector.extract %cst_0[10, 0] : vector<32x1xi32>
%21 = vector.extract %cst_0[11, 0] : vector<32x1xi32>
%22 = vector.extract %cst_0[12, 0] : vector<32x1xi32>
%23 = vector.extract %cst_0[13, 0] : vector<32x1xi32>
%24 = vector.extract %cst_0[14, 0] : vector<32x1xi32>
%25 = vector.extract %cst_0[15, 0] : vector<32x1xi32>
%26 = vector.extract %cst_0[16, 0] : vector<32x1xi32>
%27 = vector.extract %cst_0[17, 0] : vector<32x1xi32>
%28 = vector.extract %cst_0[18, 0] : vector<32x1xi32>
%29 = vector.extract %cst_0[19, 0] : vector<32x1xi32>
%30 = vector.extract %cst_0[20, 0] : vector<32x1xi32>
%31 = vector.extract %cst_0[21, 0] : vector<32x1xi32>
%32 = vector.extract %cst_0[22, 0] : vector<32x1xi32>
%33 = vector.extract %cst_0[23, 0] : vector<32x1xi32>
%34 = vector.extract %cst_0[24, 0] : vector<32x1xi32>
%35 = vector.extract %cst_0[25, 0] : vector<32x1xi32>
%36 = vector.extract %cst_0[26, 0] : vector<32x1xi32>
%37 = vector.extract %cst_0[27, 0] : vector<32x1xi32>
%38 = vector.extract %cst_0[28, 0] : vector<32x1xi32>
%39 = vector.extract %cst_0[29, 0] : vector<32x1xi32>
%40 = vector.extract %cst_0[30, 0] : vector<32x1xi32>
%41 = vector.extract %cst_0[31, 0] : vector<32x1xi32>
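// %10 through %41 are the 32 zero lanes of %cst_0, kept as scalars for the unrolled fills below.
// Walk the 60x64 output tile: one row at a time, 32 columns per step.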
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%42 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%4]
%43 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%4]
%44 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%43)[%8, %6]
%45 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%4]
%46 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%45)[%8, %6]
%47 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%46, %44)
%subview_3 = memref.subview %subview_1[0, %arg0, %44, 0] [1, 1, %47, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
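// Zero-initialize all 32 lanes of the output tile, one scalar store per lane.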
memref.store %10, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %11, %subview_4[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %12, %subview_4[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %13, %subview_4[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %14, %subview_4[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %15, %subview_4[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %16, %subview_4[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %17, %subview_4[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %18, %subview_4[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %19, %subview_4[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %20, %subview_4[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %21, %subview_4[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %22, %subview_4[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %23, %subview_4[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %24, %subview_4[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %25, %subview_4[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %26, %subview_4[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %27, %subview_4[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %28, %subview_4[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %29, %subview_4[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %30, %subview_4[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %31, %subview_4[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %32, %subview_4[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %33, %subview_4[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %34, %subview_4[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %35, %subview_4[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %36, %subview_4[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %37, %subview_4[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %38, %subview_4[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %39, %subview_4[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %40, %subview_4[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %41, %subview_4[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
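// The zero-filled output tile is viewed as 1x32 (%collapse_shape_6) for vector accumulation;
// the loop below reduces over the 43 filter taps.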
scf.for %arg2 = %c0 to %c43 step %c1 {
%48 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%42, %arg2)
%49 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %42)
%50 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%49, %46, %44)
%51 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %42)
%52 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%51, %46, %44)
%53 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%52, %50)
%subview_7 = memref.subview %subview_3[0, 0, %50, 0] [1, 1, %53, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
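// For tap %arg2: %48 is the number of leading lanes that stay zero-padded, [%50, %52) is the
// in-bounds input range and %53 its dynamic length; %subview_7 is that valid slice.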
memref.store %10, %alloca[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32>
memref.store %11, %alloca[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32>
memref.store %12, %alloca[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32>
memref.store %13, %alloca[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32>
memref.store %14, %alloca[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32>
memref.store %15, %alloca[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32>
memref.store %16, %alloca[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32>
memref.store %17, %alloca[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32>
memref.store %18, %alloca[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32>
memref.store %19, %alloca[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32>
memref.store %20, %alloca[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32>
memref.store %21, %alloca[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32>
memref.store %22, %alloca[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32>
memref.store %23, %alloca[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32>
memref.store %24, %alloca[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32>
memref.store %25, %alloca[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32>
memref.store %26, %alloca[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32>
memref.store %27, %alloca[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32>
memref.store %28, %alloca[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32>
memref.store %29, %alloca[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32>
memref.store %30, %alloca[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32>
memref.store %31, %alloca[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32>
memref.store %32, %alloca[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32>
memref.store %33, %alloca[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32>
memref.store %34, %alloca[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32>
memref.store %35, %alloca[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32>
memref.store %36, %alloca[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32>
memref.store %37, %alloca[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32>
memref.store %38, %alloca[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32>
memref.store %39, %alloca[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32>
memref.store %40, %alloca[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32>
memref.store %41, %alloca[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32>
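// Emulated masked load: lane i of the window is read from %subview_7 only when i < %53,
// so out-of-bounds lanes keep the zero from %cst_0.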
%54 = arith.cmpi sgt, %53, %c0 : index
%55 = scf.if %54 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c0, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %cst_0 [0] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %cst_0 : vector<32x1xi32>
}
%56 = arith.cmpi sgt, %53, %c1 : index
%57 = scf.if %56 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c1, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %55 [1] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %55 : vector<32x1xi32>
}
%58 = arith.cmpi sgt, %53, %c2 : index
%59 = scf.if %58 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c2, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %57 [2] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %57 : vector<32x1xi32>
}
%60 = arith.cmpi sgt, %53, %c3 : index
%61 = scf.if %60 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c3, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %59 [3] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %59 : vector<32x1xi32>
}
%62 = arith.cmpi sgt, %53, %c4 : index
%63 = scf.if %62 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c4, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %61 [4] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %61 : vector<32x1xi32>
}
%64 = arith.cmpi sgt, %53, %c5 : index
%65 = scf.if %64 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c5, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %63 [5] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %63 : vector<32x1xi32>
}
%66 = arith.cmpi sgt, %53, %c6 : index
%67 = scf.if %66 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c6, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %65 [6] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %65 : vector<32x1xi32>
}
%68 = arith.cmpi sgt, %53, %c7 : index
%69 = scf.if %68 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c7, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %67 [7] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %67 : vector<32x1xi32>
}
%70 = arith.cmpi sgt, %53, %c8 : index
%71 = scf.if %70 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c8, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %69 [8] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %69 : vector<32x1xi32>
}
%72 = arith.cmpi sgt, %53, %c9 : index
%73 = scf.if %72 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c9, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %71 [9] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %71 : vector<32x1xi32>
}
%74 = arith.cmpi sgt, %53, %c10 : index
%75 = scf.if %74 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c10, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %73 [10] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %73 : vector<32x1xi32>
}
%76 = arith.cmpi sgt, %53, %c11 : index
%77 = scf.if %76 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c11, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %75 [11] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %75 : vector<32x1xi32>
}
%78 = arith.cmpi sgt, %53, %c12 : index
%79 = scf.if %78 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c12, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %77 [12] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %77 : vector<32x1xi32>
}
%80 = arith.cmpi sgt, %53, %c13 : index
%81 = scf.if %80 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c13, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %79 [13] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %79 : vector<32x1xi32>
}
%82 = arith.cmpi sgt, %53, %c14 : index
%83 = scf.if %82 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c14, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %81 [14] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %81 : vector<32x1xi32>
}
%84 = arith.cmpi sgt, %53, %c15 : index
%85 = scf.if %84 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c15, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %83 [15] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %83 : vector<32x1xi32>
}
%86 = arith.cmpi sgt, %53, %c16 : index
%87 = scf.if %86 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c16, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %85 [16] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %85 : vector<32x1xi32>
}
%88 = arith.cmpi sgt, %53, %c17 : index
%89 = scf.if %88 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c17, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %87 [17] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %87 : vector<32x1xi32>
}
%90 = arith.cmpi sgt, %53, %c18 : index
%91 = scf.if %90 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c18, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %89 [18] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %89 : vector<32x1xi32>
}
%92 = arith.cmpi sgt, %53, %c19 : index
%93 = scf.if %92 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c19, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %91 [19] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %91 : vector<32x1xi32>
}
%94 = arith.cmpi sgt, %53, %c20 : index
%95 = scf.if %94 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c20, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %93 [20] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %93 : vector<32x1xi32>
}
%96 = arith.cmpi sgt, %53, %c21 : index
%97 = scf.if %96 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c21, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %95 [21] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %95 : vector<32x1xi32>
}
%98 = arith.cmpi sgt, %53, %c22 : index
%99 = scf.if %98 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c22, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %97 [22] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %97 : vector<32x1xi32>
}
%100 = arith.cmpi sgt, %53, %c23 : index
%101 = scf.if %100 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c23, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %99 [23] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %99 : vector<32x1xi32>
}
%102 = arith.cmpi sgt, %53, %c24 : index
%103 = scf.if %102 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c24, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %101 [24] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %101 : vector<32x1xi32>
}
%104 = arith.cmpi sgt, %53, %c25 : index
%105 = scf.if %104 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c25, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %103 [25] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %103 : vector<32x1xi32>
}
%106 = arith.cmpi sgt, %53, %c26 : index
%107 = scf.if %106 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c26, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %105 [26] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %105 : vector<32x1xi32>
}
%108 = arith.cmpi sgt, %53, %c27 : index
%109 = scf.if %108 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c27, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %107 [27] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %107 : vector<32x1xi32>
}
%110 = arith.cmpi sgt, %53, %c28 : index
%111 = scf.if %110 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c28, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %109 [28] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %109 : vector<32x1xi32>
}
%112 = arith.cmpi sgt, %53, %c29 : index
%113 = scf.if %112 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c29, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %111 [29] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %111 : vector<32x1xi32>
}
%114 = arith.cmpi sgt, %53, %c30 : index
%115 = scf.if %114 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c30, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %113 [30] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %113 : vector<32x1xi32>
}
%116 = arith.cmpi sgt, %53, %c31 : index
%117 = scf.if %116 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c31, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %115 [31] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %115 : vector<32x1xi32>
}
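// Scatter the gathered window into the pad buffer: lane i is stored to %alloca[%48 + i]
// only when %48 + i < 32, completing the zero-padded 32-element input row for this tap.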
%118 = arith.cmpi slt, %48, %c32 : index
scf.if %118 {
%187 = vector.extract %117[0, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %48, %c0] : memref<1x1x32x1xi32>
} else {
}
%119 = affine.apply affine_map<(d0) -> (d0 + 1)>(%48)
%120 = arith.cmpi slt, %119, %c32 : index
scf.if %120 {
%187 = vector.extract %117[1, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %119, %c0] : memref<1x1x32x1xi32>
} else {
}
%121 = affine.apply affine_map<(d0) -> (d0 + 2)>(%48)
%122 = arith.cmpi slt, %121, %c32 : index
scf.if %122 {
%187 = vector.extract %117[2, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %121, %c0] : memref<1x1x32x1xi32>
} else {
}
%123 = affine.apply affine_map<(d0) -> (d0 + 3)>(%48)
%124 = arith.cmpi slt, %123, %c32 : index
scf.if %124 {
%187 = vector.extract %117[3, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %123, %c0] : memref<1x1x32x1xi32>
} else {
}
%125 = affine.apply affine_map<(d0) -> (d0 + 4)>(%48)
%126 = arith.cmpi slt, %125, %c32 : index
scf.if %126 {
%187 = vector.extract %117[4, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %125, %c0] : memref<1x1x32x1xi32>
} else {
}
%127 = affine.apply affine_map<(d0) -> (d0 + 5)>(%48)
%128 = arith.cmpi slt, %127, %c32 : index
scf.if %128 {
%187 = vector.extract %117[5, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %127, %c0] : memref<1x1x32x1xi32>
} else {
}
%129 = affine.apply affine_map<(d0) -> (d0 + 6)>(%48)
%130 = arith.cmpi slt, %129, %c32 : index
scf.if %130 {
%187 = vector.extract %117[6, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %129, %c0] : memref<1x1x32x1xi32>
} else {
}
%131 = affine.apply affine_map<(d0) -> (d0 + 7)>(%48)
%132 = arith.cmpi slt, %131, %c32 : index
scf.if %132 {
%187 = vector.extract %117[7, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %131, %c0] : memref<1x1x32x1xi32>
} else {
}
%133 = affine.apply affine_map<(d0) -> (d0 + 8)>(%48)
%134 = arith.cmpi slt, %133, %c32 : index
scf.if %134 {
%187 = vector.extract %117[8, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %133, %c0] : memref<1x1x32x1xi32>
} else {
}
%135 = affine.apply affine_map<(d0) -> (d0 + 9)>(%48)
%136 = arith.cmpi slt, %135, %c32 : index
scf.if %136 {
%187 = vector.extract %117[9, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %135, %c0] : memref<1x1x32x1xi32>
} else {
}
%137 = affine.apply affine_map<(d0) -> (d0 + 10)>(%48)
%138 = arith.cmpi slt, %137, %c32 : index
scf.if %138 {
%187 = vector.extract %117[10, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %137, %c0] : memref<1x1x32x1xi32>
} else {
}
%139 = affine.apply affine_map<(d0) -> (d0 + 11)>(%48)
%140 = arith.cmpi slt, %139, %c32 : index
scf.if %140 {
%187 = vector.extract %117[11, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %139, %c0] : memref<1x1x32x1xi32>
} else {
}
%141 = affine.apply affine_map<(d0) -> (d0 + 12)>(%48)
%142 = arith.cmpi slt, %141, %c32 : index
scf.if %142 {
%187 = vector.extract %117[12, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %141, %c0] : memref<1x1x32x1xi32>
} else {
}
%143 = affine.apply affine_map<(d0) -> (d0 + 13)>(%48)
%144 = arith.cmpi slt, %143, %c32 : index
scf.if %144 {
%187 = vector.extract %117[13, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %143, %c0] : memref<1x1x32x1xi32>
} else {
}
%145 = affine.apply affine_map<(d0) -> (d0 + 14)>(%48)
%146 = arith.cmpi slt, %145, %c32 : index
scf.if %146 {
%187 = vector.extract %117[14, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %145, %c0] : memref<1x1x32x1xi32>
} else {
}
%147 = affine.apply affine_map<(d0) -> (d0 + 15)>(%48)
%148 = arith.cmpi slt, %147, %c32 : index
scf.if %148 {
%187 = vector.extract %117[15, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %147, %c0] : memref<1x1x32x1xi32>
} else {
}
%149 = affine.apply affine_map<(d0) -> (d0 + 16)>(%48)
%150 = arith.cmpi slt, %149, %c32 : index
scf.if %150 {
%187 = vector.extract %117[16, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %149, %c0] : memref<1x1x32x1xi32>
} else {
}
%151 = affine.apply affine_map<(d0) -> (d0 + 17)>(%48)
%152 = arith.cmpi slt, %151, %c32 : index
scf.if %152 {
%187 = vector.extract %117[17, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %151, %c0] : memref<1x1x32x1xi32>
} else {
}
%153 = affine.apply affine_map<(d0) -> (d0 + 18)>(%48)
%154 = arith.cmpi slt, %153, %c32 : index
scf.if %154 {
%187 = vector.extract %117[18, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %153, %c0] : memref<1x1x32x1xi32>
} else {
}
%155 = affine.apply affine_map<(d0) -> (d0 + 19)>(%48)
%156 = arith.cmpi slt, %155, %c32 : index
scf.if %156 {
%187 = vector.extract %117[19, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %155, %c0] : memref<1x1x32x1xi32>
} else {
}
%157 = affine.apply affine_map<(d0) -> (d0 + 20)>(%48)
%158 = arith.cmpi slt, %157, %c32 : index
scf.if %158 {
%187 = vector.extract %117[20, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %157, %c0] : memref<1x1x32x1xi32>
} else {
}
%159 = affine.apply affine_map<(d0) -> (d0 + 21)>(%48)
%160 = arith.cmpi slt, %159, %c32 : index
scf.if %160 {
%187 = vector.extract %117[21, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %159, %c0] : memref<1x1x32x1xi32>
} else {
}
%161 = affine.apply affine_map<(d0) -> (d0 + 22)>(%48)
%162 = arith.cmpi slt, %161, %c32 : index
scf.if %162 {
%187 = vector.extract %117[22, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %161, %c0] : memref<1x1x32x1xi32>
} else {
}
%163 = affine.apply affine_map<(d0) -> (d0 + 23)>(%48)
%164 = arith.cmpi slt, %163, %c32 : index
scf.if %164 {
%187 = vector.extract %117[23, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %163, %c0] : memref<1x1x32x1xi32>
} else {
}
%165 = affine.apply affine_map<(d0) -> (d0 + 24)>(%48)
%166 = arith.cmpi slt, %165, %c32 : index
scf.if %166 {
%187 = vector.extract %117[24, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %165, %c0] : memref<1x1x32x1xi32>
} else {
}
%167 = affine.apply affine_map<(d0) -> (d0 + 25)>(%48)
%168 = arith.cmpi slt, %167, %c32 : index
scf.if %168 {
%187 = vector.extract %117[25, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %167, %c0] : memref<1x1x32x1xi32>
} else {
}
%169 = affine.apply affine_map<(d0) -> (d0 + 26)>(%48)
%170 = arith.cmpi slt, %169, %c32 : index
scf.if %170 {
%187 = vector.extract %117[26, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %169, %c0] : memref<1x1x32x1xi32>
} else {
}
%171 = affine.apply affine_map<(d0) -> (d0 + 27)>(%48)
%172 = arith.cmpi slt, %171, %c32 : index
scf.if %172 {
%187 = vector.extract %117[27, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %171, %c0] : memref<1x1x32x1xi32>
} else {
}
%173 = affine.apply affine_map<(d0) -> (d0 + 28)>(%48)
%174 = arith.cmpi slt, %173, %c32 : index
scf.if %174 {
%187 = vector.extract %117[28, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %173, %c0] : memref<1x1x32x1xi32>
} else {
}
%175 = affine.apply affine_map<(d0) -> (d0 + 29)>(%48)
%176 = arith.cmpi slt, %175, %c32 : index
scf.if %176 {
%187 = vector.extract %117[29, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %175, %c0] : memref<1x1x32x1xi32>
} else {
}
%177 = affine.apply affine_map<(d0) -> (d0 + 30)>(%48)
%178 = arith.cmpi slt, %177, %c32 : index
scf.if %178 {
%187 = vector.extract %117[30, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %177, %c0] : memref<1x1x32x1xi32>
} else {
}
%179 = affine.apply affine_map<(d0) -> (d0 + 31)>(%48)
%180 = arith.cmpi slt, %179, %c32 : index
scf.if %180 {
%187 = vector.extract %117[31, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %179, %c0] : memref<1x1x32x1xi32>
} else {
}
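// Per-tap multiply-accumulate: load the 32 padded input lanes, splat the filter tap
// %cst[0, %arg2, 0] across 32 lanes, multiply, and add into the output row in place.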
%181 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%extracted = tensor.extract %cst[%c0, %arg2, %c0] : tensor<1x43x1xi32>
%182 = vector.broadcast %extracted : i32 to vector<1xi32>
%183 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%184 = vector.shuffle %182, %182 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%185 = arith.muli %181, %184 : vector<32xi32>
%186 = arith.addi %185, %183 : vector<32xi32>
vector.store %186, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
}
}
}
return
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (iree-linalg-strategy-remove-markers-pass) //----- //
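// With the transformation markers removed, the function body is reprinted otherwise unchanged from the previous dump.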
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c5 = arith.constant 5 : index
%c6 = arith.constant 6 : index
%c7 = arith.constant 7 : index
%c8 = arith.constant 8 : index
%c9 = arith.constant 9 : index
%c10 = arith.constant 10 : index
%c11 = arith.constant 11 : index
%c12 = arith.constant 12 : index
%c13 = arith.constant 13 : index
%c14 = arith.constant 14 : index
%c15 = arith.constant 15 : index
%c16 = arith.constant 16 : index
%c17 = arith.constant 17 : index
%c18 = arith.constant 18 : index
%c19 = arith.constant 19 : index
%c20 = arith.constant 20 : index
%c21 = arith.constant 21 : index
%c22 = arith.constant 22 : index
%c23 = arith.constant 23 : index
%c24 = arith.constant 24 : index
%c25 = arith.constant 25 : index
%c26 = arith.constant 26 : index
%c27 = arith.constant 27 : index
%c28 = arith.constant 28 : index
%c29 = arith.constant 29 : index
%c30 = arith.constant 30 : index
%c31 = arith.constant 31 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %0, 64 : memref<1x1080x1920x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %1[0, %2, %3, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%4 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%5 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (1920, s0)>()[%5]
%7 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%8 = affine.min affine_map<()[s0] -> (1920, s0)>()[%7]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%8, %6]
%subview_1 = memref.subview %0[0, %2, %6, 0] [1, 60, %9, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
%10 = vector.extract %cst_0[0, 0] : vector<32x1xi32>
%11 = vector.extract %cst_0[1, 0] : vector<32x1xi32>
%12 = vector.extract %cst_0[2, 0] : vector<32x1xi32>
%13 = vector.extract %cst_0[3, 0] : vector<32x1xi32>
%14 = vector.extract %cst_0[4, 0] : vector<32x1xi32>
%15 = vector.extract %cst_0[5, 0] : vector<32x1xi32>
%16 = vector.extract %cst_0[6, 0] : vector<32x1xi32>
%17 = vector.extract %cst_0[7, 0] : vector<32x1xi32>
%18 = vector.extract %cst_0[8, 0] : vector<32x1xi32>
%19 = vector.extract %cst_0[9, 0] : vector<32x1xi32>
%20 = vector.extract %cst_0[10, 0] : vector<32x1xi32>
%21 = vector.extract %cst_0[11, 0] : vector<32x1xi32>
%22 = vector.extract %cst_0[12, 0] : vector<32x1xi32>
%23 = vector.extract %cst_0[13, 0] : vector<32x1xi32>
%24 = vector.extract %cst_0[14, 0] : vector<32x1xi32>
%25 = vector.extract %cst_0[15, 0] : vector<32x1xi32>
%26 = vector.extract %cst_0[16, 0] : vector<32x1xi32>
%27 = vector.extract %cst_0[17, 0] : vector<32x1xi32>
%28 = vector.extract %cst_0[18, 0] : vector<32x1xi32>
%29 = vector.extract %cst_0[19, 0] : vector<32x1xi32>
%30 = vector.extract %cst_0[20, 0] : vector<32x1xi32>
%31 = vector.extract %cst_0[21, 0] : vector<32x1xi32>
%32 = vector.extract %cst_0[22, 0] : vector<32x1xi32>
%33 = vector.extract %cst_0[23, 0] : vector<32x1xi32>
%34 = vector.extract %cst_0[24, 0] : vector<32x1xi32>
%35 = vector.extract %cst_0[25, 0] : vector<32x1xi32>
%36 = vector.extract %cst_0[26, 0] : vector<32x1xi32>
%37 = vector.extract %cst_0[27, 0] : vector<32x1xi32>
%38 = vector.extract %cst_0[28, 0] : vector<32x1xi32>
%39 = vector.extract %cst_0[29, 0] : vector<32x1xi32>
%40 = vector.extract %cst_0[30, 0] : vector<32x1xi32>
%41 = vector.extract %cst_0[31, 0] : vector<32x1xi32>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%42 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%4]
%43 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%4]
%44 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%43)[%8, %6]
%45 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%4]
%46 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%45)[%8, %6]
%47 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%46, %44)
%subview_3 = memref.subview %subview_1[0, %arg0, %44, 0] [1, 1, %47, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %10, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %11, %subview_4[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %12, %subview_4[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %13, %subview_4[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %14, %subview_4[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %15, %subview_4[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %16, %subview_4[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %17, %subview_4[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %18, %subview_4[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %19, %subview_4[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %20, %subview_4[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %21, %subview_4[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %22, %subview_4[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %23, %subview_4[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %24, %subview_4[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %25, %subview_4[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %26, %subview_4[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %27, %subview_4[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %28, %subview_4[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %29, %subview_4[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %30, %subview_4[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %31, %subview_4[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %32, %subview_4[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %33, %subview_4[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %34, %subview_4[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %35, %subview_4[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %36, %subview_4[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %37, %subview_4[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %38, %subview_4[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %39, %subview_4[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %40, %subview_4[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %41, %subview_4[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%48 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%42, %arg2)
%49 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %42)
%50 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%49, %46, %44)
%51 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %42)
%52 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%51, %46, %44)
%53 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%52, %50)
%subview_7 = memref.subview %subview_3[0, 0, %50, 0] [1, 1, %53, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %10, %alloca[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32>
memref.store %11, %alloca[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32>
memref.store %12, %alloca[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32>
memref.store %13, %alloca[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32>
memref.store %14, %alloca[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32>
memref.store %15, %alloca[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32>
memref.store %16, %alloca[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32>
memref.store %17, %alloca[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32>
memref.store %18, %alloca[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32>
memref.store %19, %alloca[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32>
memref.store %20, %alloca[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32>
memref.store %21, %alloca[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32>
memref.store %22, %alloca[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32>
memref.store %23, %alloca[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32>
memref.store %24, %alloca[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32>
memref.store %25, %alloca[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32>
memref.store %26, %alloca[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32>
memref.store %27, %alloca[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32>
memref.store %28, %alloca[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32>
memref.store %29, %alloca[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32>
memref.store %30, %alloca[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32>
memref.store %31, %alloca[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32>
memref.store %32, %alloca[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32>
memref.store %33, %alloca[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32>
memref.store %34, %alloca[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32>
memref.store %35, %alloca[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32>
memref.store %36, %alloca[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32>
memref.store %37, %alloca[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32>
memref.store %38, %alloca[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32>
memref.store %39, %alloca[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32>
memref.store %40, %alloca[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32>
memref.store %41, %alloca[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32>
%54 = arith.cmpi sgt, %53, %c0 : index
%55 = scf.if %54 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c0, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %cst_0 [0] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %cst_0 : vector<32x1xi32>
}
%56 = arith.cmpi sgt, %53, %c1 : index
%57 = scf.if %56 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c1, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %55 [1] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %55 : vector<32x1xi32>
}
%58 = arith.cmpi sgt, %53, %c2 : index
%59 = scf.if %58 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c2, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %57 [2] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %57 : vector<32x1xi32>
}
%60 = arith.cmpi sgt, %53, %c3 : index
%61 = scf.if %60 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c3, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %59 [3] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %59 : vector<32x1xi32>
}
%62 = arith.cmpi sgt, %53, %c4 : index
%63 = scf.if %62 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c4, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %61 [4] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %61 : vector<32x1xi32>
}
%64 = arith.cmpi sgt, %53, %c5 : index
%65 = scf.if %64 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c5, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %63 [5] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %63 : vector<32x1xi32>
}
%66 = arith.cmpi sgt, %53, %c6 : index
%67 = scf.if %66 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c6, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %65 [6] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %65 : vector<32x1xi32>
}
%68 = arith.cmpi sgt, %53, %c7 : index
%69 = scf.if %68 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c7, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %67 [7] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %67 : vector<32x1xi32>
}
%70 = arith.cmpi sgt, %53, %c8 : index
%71 = scf.if %70 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c8, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %69 [8] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %69 : vector<32x1xi32>
}
%72 = arith.cmpi sgt, %53, %c9 : index
%73 = scf.if %72 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c9, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %71 [9] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %71 : vector<32x1xi32>
}
%74 = arith.cmpi sgt, %53, %c10 : index
%75 = scf.if %74 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c10, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %73 [10] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %73 : vector<32x1xi32>
}
%76 = arith.cmpi sgt, %53, %c11 : index
%77 = scf.if %76 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c11, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %75 [11] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %75 : vector<32x1xi32>
}
%78 = arith.cmpi sgt, %53, %c12 : index
%79 = scf.if %78 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c12, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %77 [12] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %77 : vector<32x1xi32>
}
%80 = arith.cmpi sgt, %53, %c13 : index
%81 = scf.if %80 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c13, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %79 [13] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %79 : vector<32x1xi32>
}
%82 = arith.cmpi sgt, %53, %c14 : index
%83 = scf.if %82 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c14, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %81 [14] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %81 : vector<32x1xi32>
}
%84 = arith.cmpi sgt, %53, %c15 : index
%85 = scf.if %84 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c15, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %83 [15] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %83 : vector<32x1xi32>
}
%86 = arith.cmpi sgt, %53, %c16 : index
%87 = scf.if %86 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c16, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %85 [16] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %85 : vector<32x1xi32>
}
%88 = arith.cmpi sgt, %53, %c17 : index
%89 = scf.if %88 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c17, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %87 [17] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %87 : vector<32x1xi32>
}
%90 = arith.cmpi sgt, %53, %c18 : index
%91 = scf.if %90 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c18, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %89 [18] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %89 : vector<32x1xi32>
}
%92 = arith.cmpi sgt, %53, %c19 : index
%93 = scf.if %92 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c19, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %91 [19] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %91 : vector<32x1xi32>
}
%94 = arith.cmpi sgt, %53, %c20 : index
%95 = scf.if %94 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c20, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %93 [20] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %93 : vector<32x1xi32>
}
%96 = arith.cmpi sgt, %53, %c21 : index
%97 = scf.if %96 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c21, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %95 [21] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %95 : vector<32x1xi32>
}
%98 = arith.cmpi sgt, %53, %c22 : index
%99 = scf.if %98 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c22, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %97 [22] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %97 : vector<32x1xi32>
}
%100 = arith.cmpi sgt, %53, %c23 : index
%101 = scf.if %100 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c23, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %99 [23] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %99 : vector<32x1xi32>
}
%102 = arith.cmpi sgt, %53, %c24 : index
%103 = scf.if %102 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c24, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %101 [24] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %101 : vector<32x1xi32>
}
%104 = arith.cmpi sgt, %53, %c25 : index
%105 = scf.if %104 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c25, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %103 [25] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %103 : vector<32x1xi32>
}
%106 = arith.cmpi sgt, %53, %c26 : index
%107 = scf.if %106 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c26, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %105 [26] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %105 : vector<32x1xi32>
}
%108 = arith.cmpi sgt, %53, %c27 : index
%109 = scf.if %108 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c27, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %107 [27] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %107 : vector<32x1xi32>
}
%110 = arith.cmpi sgt, %53, %c28 : index
%111 = scf.if %110 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c28, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %109 [28] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %109 : vector<32x1xi32>
}
%112 = arith.cmpi sgt, %53, %c29 : index
%113 = scf.if %112 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c29, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %111 [29] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %111 : vector<32x1xi32>
}
%114 = arith.cmpi sgt, %53, %c30 : index
%115 = scf.if %114 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c30, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %113 [30] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %113 : vector<32x1xi32>
}
%116 = arith.cmpi sgt, %53, %c31 : index
%117 = scf.if %116 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c31, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %115 [31] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %115 : vector<32x1xi32>
}
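// Annotation: end of the unrolled gather. The scf.if chain above assembles %117 by
// inserting input pixel i from %subview_7 only when i < %53; lanes past the valid
// width keep the zero from %cst_0. This is the scalarized form of a masked vector
// load on the dynamically sized slice.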
%118 = arith.cmpi slt, %48, %c32 : index
scf.if %118 {
%187 = vector.extract %117[0, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %48, %c0] : memref<1x1x32x1xi32>
} else {
}
%119 = affine.apply affine_map<(d0) -> (d0 + 1)>(%48)
%120 = arith.cmpi slt, %119, %c32 : index
scf.if %120 {
%187 = vector.extract %117[1, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %119, %c0] : memref<1x1x32x1xi32>
} else {
}
%121 = affine.apply affine_map<(d0) -> (d0 + 2)>(%48)
%122 = arith.cmpi slt, %121, %c32 : index
scf.if %122 {
%187 = vector.extract %117[2, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %121, %c0] : memref<1x1x32x1xi32>
} else {
}
%123 = affine.apply affine_map<(d0) -> (d0 + 3)>(%48)
%124 = arith.cmpi slt, %123, %c32 : index
scf.if %124 {
%187 = vector.extract %117[3, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %123, %c0] : memref<1x1x32x1xi32>
} else {
}
%125 = affine.apply affine_map<(d0) -> (d0 + 4)>(%48)
%126 = arith.cmpi slt, %125, %c32 : index
scf.if %126 {
%187 = vector.extract %117[4, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %125, %c0] : memref<1x1x32x1xi32>
} else {
}
%127 = affine.apply affine_map<(d0) -> (d0 + 5)>(%48)
%128 = arith.cmpi slt, %127, %c32 : index
scf.if %128 {
%187 = vector.extract %117[5, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %127, %c0] : memref<1x1x32x1xi32>
} else {
}
%129 = affine.apply affine_map<(d0) -> (d0 + 6)>(%48)
%130 = arith.cmpi slt, %129, %c32 : index
scf.if %130 {
%187 = vector.extract %117[6, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %129, %c0] : memref<1x1x32x1xi32>
} else {
}
%131 = affine.apply affine_map<(d0) -> (d0 + 7)>(%48)
%132 = arith.cmpi slt, %131, %c32 : index
scf.if %132 {
%187 = vector.extract %117[7, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %131, %c0] : memref<1x1x32x1xi32>
} else {
}
%133 = affine.apply affine_map<(d0) -> (d0 + 8)>(%48)
%134 = arith.cmpi slt, %133, %c32 : index
scf.if %134 {
%187 = vector.extract %117[8, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %133, %c0] : memref<1x1x32x1xi32>
} else {
}
%135 = affine.apply affine_map<(d0) -> (d0 + 9)>(%48)
%136 = arith.cmpi slt, %135, %c32 : index
scf.if %136 {
%187 = vector.extract %117[9, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %135, %c0] : memref<1x1x32x1xi32>
} else {
}
%137 = affine.apply affine_map<(d0) -> (d0 + 10)>(%48)
%138 = arith.cmpi slt, %137, %c32 : index
scf.if %138 {
%187 = vector.extract %117[10, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %137, %c0] : memref<1x1x32x1xi32>
} else {
}
%139 = affine.apply affine_map<(d0) -> (d0 + 11)>(%48)
%140 = arith.cmpi slt, %139, %c32 : index
scf.if %140 {
%187 = vector.extract %117[11, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %139, %c0] : memref<1x1x32x1xi32>
} else {
}
%141 = affine.apply affine_map<(d0) -> (d0 + 12)>(%48)
%142 = arith.cmpi slt, %141, %c32 : index
scf.if %142 {
%187 = vector.extract %117[12, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %141, %c0] : memref<1x1x32x1xi32>
} else {
}
%143 = affine.apply affine_map<(d0) -> (d0 + 13)>(%48)
%144 = arith.cmpi slt, %143, %c32 : index
scf.if %144 {
%187 = vector.extract %117[13, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %143, %c0] : memref<1x1x32x1xi32>
} else {
}
%145 = affine.apply affine_map<(d0) -> (d0 + 14)>(%48)
%146 = arith.cmpi slt, %145, %c32 : index
scf.if %146 {
%187 = vector.extract %117[14, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %145, %c0] : memref<1x1x32x1xi32>
} else {
}
%147 = affine.apply affine_map<(d0) -> (d0 + 15)>(%48)
%148 = arith.cmpi slt, %147, %c32 : index
scf.if %148 {
%187 = vector.extract %117[15, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %147, %c0] : memref<1x1x32x1xi32>
} else {
}
%149 = affine.apply affine_map<(d0) -> (d0 + 16)>(%48)
%150 = arith.cmpi slt, %149, %c32 : index
scf.if %150 {
%187 = vector.extract %117[16, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %149, %c0] : memref<1x1x32x1xi32>
} else {
}
%151 = affine.apply affine_map<(d0) -> (d0 + 17)>(%48)
%152 = arith.cmpi slt, %151, %c32 : index
scf.if %152 {
%187 = vector.extract %117[17, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %151, %c0] : memref<1x1x32x1xi32>
} else {
}
%153 = affine.apply affine_map<(d0) -> (d0 + 18)>(%48)
%154 = arith.cmpi slt, %153, %c32 : index
scf.if %154 {
%187 = vector.extract %117[18, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %153, %c0] : memref<1x1x32x1xi32>
} else {
}
%155 = affine.apply affine_map<(d0) -> (d0 + 19)>(%48)
%156 = arith.cmpi slt, %155, %c32 : index
scf.if %156 {
%187 = vector.extract %117[19, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %155, %c0] : memref<1x1x32x1xi32>
} else {
}
%157 = affine.apply affine_map<(d0) -> (d0 + 20)>(%48)
%158 = arith.cmpi slt, %157, %c32 : index
scf.if %158 {
%187 = vector.extract %117[20, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %157, %c0] : memref<1x1x32x1xi32>
} else {
}
%159 = affine.apply affine_map<(d0) -> (d0 + 21)>(%48)
%160 = arith.cmpi slt, %159, %c32 : index
scf.if %160 {
%187 = vector.extract %117[21, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %159, %c0] : memref<1x1x32x1xi32>
} else {
}
%161 = affine.apply affine_map<(d0) -> (d0 + 22)>(%48)
%162 = arith.cmpi slt, %161, %c32 : index
scf.if %162 {
%187 = vector.extract %117[22, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %161, %c0] : memref<1x1x32x1xi32>
} else {
}
%163 = affine.apply affine_map<(d0) -> (d0 + 23)>(%48)
%164 = arith.cmpi slt, %163, %c32 : index
scf.if %164 {
%187 = vector.extract %117[23, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %163, %c0] : memref<1x1x32x1xi32>
} else {
}
%165 = affine.apply affine_map<(d0) -> (d0 + 24)>(%48)
%166 = arith.cmpi slt, %165, %c32 : index
scf.if %166 {
%187 = vector.extract %117[24, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %165, %c0] : memref<1x1x32x1xi32>
} else {
}
%167 = affine.apply affine_map<(d0) -> (d0 + 25)>(%48)
%168 = arith.cmpi slt, %167, %c32 : index
scf.if %168 {
%187 = vector.extract %117[25, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %167, %c0] : memref<1x1x32x1xi32>
} else {
}
%169 = affine.apply affine_map<(d0) -> (d0 + 26)>(%48)
%170 = arith.cmpi slt, %169, %c32 : index
scf.if %170 {
%187 = vector.extract %117[26, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %169, %c0] : memref<1x1x32x1xi32>
} else {
}
%171 = affine.apply affine_map<(d0) -> (d0 + 27)>(%48)
%172 = arith.cmpi slt, %171, %c32 : index
scf.if %172 {
%187 = vector.extract %117[27, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %171, %c0] : memref<1x1x32x1xi32>
} else {
}
%173 = affine.apply affine_map<(d0) -> (d0 + 28)>(%48)
%174 = arith.cmpi slt, %173, %c32 : index
scf.if %174 {
%187 = vector.extract %117[28, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %173, %c0] : memref<1x1x32x1xi32>
} else {
}
%175 = affine.apply affine_map<(d0) -> (d0 + 29)>(%48)
%176 = arith.cmpi slt, %175, %c32 : index
scf.if %176 {
%187 = vector.extract %117[29, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %175, %c0] : memref<1x1x32x1xi32>
} else {
}
%177 = affine.apply affine_map<(d0) -> (d0 + 30)>(%48)
%178 = arith.cmpi slt, %177, %c32 : index
scf.if %178 {
%187 = vector.extract %117[30, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %177, %c0] : memref<1x1x32x1xi32>
} else {
}
%179 = affine.apply affine_map<(d0) -> (d0 + 31)>(%48)
%180 = arith.cmpi slt, %179, %c32 : index
scf.if %180 {
%187 = vector.extract %117[31, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %179, %c0] : memref<1x1x32x1xi32>
} else {
}
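// Annotation: end of the unrolled scatter. Each guarded store above writes lane i of
// %117 into %alloca at offset %48 + i whenever that offset is still inside the
// 32-element buffer, completing the zero-padded input window. The ops that follow are
// the actual compute: load the padded window, broadcast this tap's coefficient from
// the constant filter %cst, multiply, and accumulate into the output tile in place.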
%181 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%extracted = tensor.extract %cst[%c0, %arg2, %c0] : tensor<1x43x1xi32>
%182 = vector.broadcast %extracted : i32 to vector<1xi32>
%183 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%184 = vector.shuffle %182, %182 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%185 = arith.muli %181, %184 : vector<32xi32>
%186 = arith.addi %185, %183 : vector<32xi32>
vector.store %186, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
}
}
}
return
}
// -----// IR Dump After CSE (cse) //----- //
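// Annotation: after CSE the repeated index constants, affine computations and
// zero-vector extracts are each materialized once, ahead of the loop nest; the
// unrolled gather/scatter structure of the per-tap loop body is unchanged from the
// dump above.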
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c5 = arith.constant 5 : index
%c6 = arith.constant 6 : index
%c7 = arith.constant 7 : index
%c8 = arith.constant 8 : index
%c9 = arith.constant 9 : index
%c10 = arith.constant 10 : index
%c11 = arith.constant 11 : index
%c12 = arith.constant 12 : index
%c13 = arith.constant 13 : index
%c14 = arith.constant 14 : index
%c15 = arith.constant 15 : index
%c16 = arith.constant 16 : index
%c17 = arith.constant 17 : index
%c18 = arith.constant 18 : index
%c19 = arith.constant 19 : index
%c20 = arith.constant 20 : index
%c21 = arith.constant 21 : index
%c22 = arith.constant 22 : index
%c23 = arith.constant 23 : index
%c24 = arith.constant 24 : index
%c25 = arith.constant 25 : index
%c26 = arith.constant 26 : index
%c27 = arith.constant 27 : index
%c28 = arith.constant 28 : index
%c29 = arith.constant 29 : index
%c30 = arith.constant 30 : index
%c31 = arith.constant 31 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %0, 64 : memref<1x1080x1920x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %1[0, %2, %3, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%4 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%5 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (1920, s0)>()[%5]
%7 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%8 = affine.min affine_map<()[s0] -> (1920, s0)>()[%7]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%8, %6]
%subview_1 = memref.subview %0[0, %2, %6, 0] [1, 60, %9, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
%10 = vector.extract %cst_0[0, 0] : vector<32x1xi32>
%11 = vector.extract %cst_0[1, 0] : vector<32x1xi32>
%12 = vector.extract %cst_0[2, 0] : vector<32x1xi32>
%13 = vector.extract %cst_0[3, 0] : vector<32x1xi32>
%14 = vector.extract %cst_0[4, 0] : vector<32x1xi32>
%15 = vector.extract %cst_0[5, 0] : vector<32x1xi32>
%16 = vector.extract %cst_0[6, 0] : vector<32x1xi32>
%17 = vector.extract %cst_0[7, 0] : vector<32x1xi32>
%18 = vector.extract %cst_0[8, 0] : vector<32x1xi32>
%19 = vector.extract %cst_0[9, 0] : vector<32x1xi32>
%20 = vector.extract %cst_0[10, 0] : vector<32x1xi32>
%21 = vector.extract %cst_0[11, 0] : vector<32x1xi32>
%22 = vector.extract %cst_0[12, 0] : vector<32x1xi32>
%23 = vector.extract %cst_0[13, 0] : vector<32x1xi32>
%24 = vector.extract %cst_0[14, 0] : vector<32x1xi32>
%25 = vector.extract %cst_0[15, 0] : vector<32x1xi32>
%26 = vector.extract %cst_0[16, 0] : vector<32x1xi32>
%27 = vector.extract %cst_0[17, 0] : vector<32x1xi32>
%28 = vector.extract %cst_0[18, 0] : vector<32x1xi32>
%29 = vector.extract %cst_0[19, 0] : vector<32x1xi32>
%30 = vector.extract %cst_0[20, 0] : vector<32x1xi32>
%31 = vector.extract %cst_0[21, 0] : vector<32x1xi32>
%32 = vector.extract %cst_0[22, 0] : vector<32x1xi32>
%33 = vector.extract %cst_0[23, 0] : vector<32x1xi32>
%34 = vector.extract %cst_0[24, 0] : vector<32x1xi32>
%35 = vector.extract %cst_0[25, 0] : vector<32x1xi32>
%36 = vector.extract %cst_0[26, 0] : vector<32x1xi32>
%37 = vector.extract %cst_0[27, 0] : vector<32x1xi32>
%38 = vector.extract %cst_0[28, 0] : vector<32x1xi32>
%39 = vector.extract %cst_0[29, 0] : vector<32x1xi32>
%40 = vector.extract %cst_0[30, 0] : vector<32x1xi32>
%41 = vector.extract %cst_0[31, 0] : vector<32x1xi32>
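// Annotation: %10..%41 are lane extracts of the splat-0 constant %cst_0, i.e. 32
// copies of 0 : i32 kept live across the loop nest to feed the unrolled zero-fill
// stores below.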
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%42 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%4]
%43 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%4]
%44 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%43)[%8, %6]
%45 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%4]
%46 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%45)[%8, %6]
%47 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%46, %44)
%subview_3 = memref.subview %subview_1[0, %arg0, %44, 0] [1, 1, %47, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %10, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %11, %subview_4[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %12, %subview_4[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %13, %subview_4[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %14, %subview_4[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %15, %subview_4[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %16, %subview_4[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %17, %subview_4[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %18, %subview_4[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %19, %subview_4[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %20, %subview_4[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %21, %subview_4[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %22, %subview_4[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %23, %subview_4[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %24, %subview_4[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %25, %subview_4[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %26, %subview_4[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %27, %subview_4[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %28, %subview_4[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %29, %subview_4[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %30, %subview_4[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %31, %subview_4[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %32, %subview_4[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %33, %subview_4[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %34, %subview_4[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %35, %subview_4[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %36, %subview_4[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %37, %subview_4[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %38, %subview_4[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %39, %subview_4[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %40, %subview_4[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %41, %subview_4[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
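// Annotation: the 32 stores above zero-initialize the 1x1x32x1 output tile
// %subview_4 before the filter taps accumulate into it -- presumably the bufferized,
// unrolled form of the fill that seeds the convolution accumulator.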
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
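// Annotation: the tap-loop body below is the same unrolled gather/scatter/
// multiply-accumulate sequence as in the previous dump, now referencing the
// deduplicated constants and extracts defined above the loop nest.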
%48 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%42, %arg2)
%49 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %42)
%50 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%49, %46, %44)
%51 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %42)
%52 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%51, %46, %44)
%53 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%52, %50)
%subview_7 = memref.subview %subview_3[0, 0, %50, 0] [1, 1, %53, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %10, %alloca[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32>
memref.store %11, %alloca[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32>
memref.store %12, %alloca[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32>
memref.store %13, %alloca[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32>
memref.store %14, %alloca[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32>
memref.store %15, %alloca[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32>
memref.store %16, %alloca[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32>
memref.store %17, %alloca[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32>
memref.store %18, %alloca[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32>
memref.store %19, %alloca[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32>
memref.store %20, %alloca[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32>
memref.store %21, %alloca[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32>
memref.store %22, %alloca[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32>
memref.store %23, %alloca[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32>
memref.store %24, %alloca[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32>
memref.store %25, %alloca[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32>
memref.store %26, %alloca[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32>
memref.store %27, %alloca[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32>
memref.store %28, %alloca[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32>
memref.store %29, %alloca[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32>
memref.store %30, %alloca[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32>
memref.store %31, %alloca[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32>
memref.store %32, %alloca[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32>
memref.store %33, %alloca[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32>
memref.store %34, %alloca[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32>
memref.store %35, %alloca[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32>
memref.store %36, %alloca[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32>
memref.store %37, %alloca[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32>
memref.store %38, %alloca[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32>
memref.store %39, %alloca[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32>
memref.store %40, %alloca[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32>
memref.store %41, %alloca[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32>
%54 = arith.cmpi sgt, %53, %c0 : index
%55 = scf.if %54 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c0, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %cst_0 [0] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %cst_0 : vector<32x1xi32>
}
%56 = arith.cmpi sgt, %53, %c1 : index
%57 = scf.if %56 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c1, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %55 [1] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %55 : vector<32x1xi32>
}
%58 = arith.cmpi sgt, %53, %c2 : index
%59 = scf.if %58 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c2, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %57 [2] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %57 : vector<32x1xi32>
}
%60 = arith.cmpi sgt, %53, %c3 : index
%61 = scf.if %60 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c3, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %59 [3] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %59 : vector<32x1xi32>
}
%62 = arith.cmpi sgt, %53, %c4 : index
%63 = scf.if %62 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c4, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %61 [4] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %61 : vector<32x1xi32>
}
%64 = arith.cmpi sgt, %53, %c5 : index
%65 = scf.if %64 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c5, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %63 [5] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %63 : vector<32x1xi32>
}
%66 = arith.cmpi sgt, %53, %c6 : index
%67 = scf.if %66 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c6, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %65 [6] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %65 : vector<32x1xi32>
}
%68 = arith.cmpi sgt, %53, %c7 : index
%69 = scf.if %68 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c7, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %67 [7] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %67 : vector<32x1xi32>
}
%70 = arith.cmpi sgt, %53, %c8 : index
%71 = scf.if %70 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c8, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %69 [8] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %69 : vector<32x1xi32>
}
%72 = arith.cmpi sgt, %53, %c9 : index
%73 = scf.if %72 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c9, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %71 [9] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %71 : vector<32x1xi32>
}
%74 = arith.cmpi sgt, %53, %c10 : index
%75 = scf.if %74 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c10, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %73 [10] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %73 : vector<32x1xi32>
}
%76 = arith.cmpi sgt, %53, %c11 : index
%77 = scf.if %76 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c11, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %75 [11] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %75 : vector<32x1xi32>
}
%78 = arith.cmpi sgt, %53, %c12 : index
%79 = scf.if %78 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c12, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %77 [12] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %77 : vector<32x1xi32>
}
%80 = arith.cmpi sgt, %53, %c13 : index
%81 = scf.if %80 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c13, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %79 [13] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %79 : vector<32x1xi32>
}
%82 = arith.cmpi sgt, %53, %c14 : index
%83 = scf.if %82 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c14, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %81 [14] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %81 : vector<32x1xi32>
}
%84 = arith.cmpi sgt, %53, %c15 : index
%85 = scf.if %84 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c15, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %83 [15] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %83 : vector<32x1xi32>
}
%86 = arith.cmpi sgt, %53, %c16 : index
%87 = scf.if %86 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c16, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %85 [16] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %85 : vector<32x1xi32>
}
%88 = arith.cmpi sgt, %53, %c17 : index
%89 = scf.if %88 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c17, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %87 [17] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %87 : vector<32x1xi32>
}
%90 = arith.cmpi sgt, %53, %c18 : index
%91 = scf.if %90 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c18, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %89 [18] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %89 : vector<32x1xi32>
}
%92 = arith.cmpi sgt, %53, %c19 : index
%93 = scf.if %92 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c19, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %91 [19] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %91 : vector<32x1xi32>
}
%94 = arith.cmpi sgt, %53, %c20 : index
%95 = scf.if %94 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c20, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %93 [20] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %93 : vector<32x1xi32>
}
%96 = arith.cmpi sgt, %53, %c21 : index
%97 = scf.if %96 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c21, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %95 [21] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %95 : vector<32x1xi32>
}
%98 = arith.cmpi sgt, %53, %c22 : index
%99 = scf.if %98 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c22, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %97 [22] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %97 : vector<32x1xi32>
}
%100 = arith.cmpi sgt, %53, %c23 : index
%101 = scf.if %100 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c23, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %99 [23] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %99 : vector<32x1xi32>
}
%102 = arith.cmpi sgt, %53, %c24 : index
%103 = scf.if %102 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c24, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %101 [24] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %101 : vector<32x1xi32>
}
%104 = arith.cmpi sgt, %53, %c25 : index
%105 = scf.if %104 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c25, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %103 [25] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %103 : vector<32x1xi32>
}
%106 = arith.cmpi sgt, %53, %c26 : index
%107 = scf.if %106 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c26, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %105 [26] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %105 : vector<32x1xi32>
}
%108 = arith.cmpi sgt, %53, %c27 : index
%109 = scf.if %108 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c27, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %107 [27] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %107 : vector<32x1xi32>
}
%110 = arith.cmpi sgt, %53, %c28 : index
%111 = scf.if %110 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c28, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %109 [28] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %109 : vector<32x1xi32>
}
%112 = arith.cmpi sgt, %53, %c29 : index
%113 = scf.if %112 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c29, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %111 [29] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %111 : vector<32x1xi32>
}
%114 = arith.cmpi sgt, %53, %c30 : index
%115 = scf.if %114 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c30, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %113 [30] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %113 : vector<32x1xi32>
}
%116 = arith.cmpi sgt, %53, %c31 : index
%117 = scf.if %116 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c31, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %115 [31] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %115 : vector<32x1xi32>
}
%118 = arith.cmpi slt, %48, %c32 : index
scf.if %118 {
%187 = vector.extract %117[0, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %48, %c0] : memref<1x1x32x1xi32>
} else {
}
%119 = affine.apply affine_map<(d0) -> (d0 + 1)>(%48)
%120 = arith.cmpi slt, %119, %c32 : index
scf.if %120 {
%187 = vector.extract %117[1, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %119, %c0] : memref<1x1x32x1xi32>
} else {
}
%121 = affine.apply affine_map<(d0) -> (d0 + 2)>(%48)
%122 = arith.cmpi slt, %121, %c32 : index
scf.if %122 {
%187 = vector.extract %117[2, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %121, %c0] : memref<1x1x32x1xi32>
} else {
}
%123 = affine.apply affine_map<(d0) -> (d0 + 3)>(%48)
%124 = arith.cmpi slt, %123, %c32 : index
scf.if %124 {
%187 = vector.extract %117[3, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %123, %c0] : memref<1x1x32x1xi32>
} else {
}
%125 = affine.apply affine_map<(d0) -> (d0 + 4)>(%48)
%126 = arith.cmpi slt, %125, %c32 : index
scf.if %126 {
%187 = vector.extract %117[4, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %125, %c0] : memref<1x1x32x1xi32>
} else {
}
%127 = affine.apply affine_map<(d0) -> (d0 + 5)>(%48)
%128 = arith.cmpi slt, %127, %c32 : index
scf.if %128 {
%187 = vector.extract %117[5, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %127, %c0] : memref<1x1x32x1xi32>
} else {
}
%129 = affine.apply affine_map<(d0) -> (d0 + 6)>(%48)
%130 = arith.cmpi slt, %129, %c32 : index
scf.if %130 {
%187 = vector.extract %117[6, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %129, %c0] : memref<1x1x32x1xi32>
} else {
}
%131 = affine.apply affine_map<(d0) -> (d0 + 7)>(%48)
%132 = arith.cmpi slt, %131, %c32 : index
scf.if %132 {
%187 = vector.extract %117[7, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %131, %c0] : memref<1x1x32x1xi32>
} else {
}
%133 = affine.apply affine_map<(d0) -> (d0 + 8)>(%48)
%134 = arith.cmpi slt, %133, %c32 : index
scf.if %134 {
%187 = vector.extract %117[8, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %133, %c0] : memref<1x1x32x1xi32>
} else {
}
%135 = affine.apply affine_map<(d0) -> (d0 + 9)>(%48)
%136 = arith.cmpi slt, %135, %c32 : index
scf.if %136 {
%187 = vector.extract %117[9, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %135, %c0] : memref<1x1x32x1xi32>
} else {
}
%137 = affine.apply affine_map<(d0) -> (d0 + 10)>(%48)
%138 = arith.cmpi slt, %137, %c32 : index
scf.if %138 {
%187 = vector.extract %117[10, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %137, %c0] : memref<1x1x32x1xi32>
} else {
}
%139 = affine.apply affine_map<(d0) -> (d0 + 11)>(%48)
%140 = arith.cmpi slt, %139, %c32 : index
scf.if %140 {
%187 = vector.extract %117[11, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %139, %c0] : memref<1x1x32x1xi32>
} else {
}
%141 = affine.apply affine_map<(d0) -> (d0 + 12)>(%48)
%142 = arith.cmpi slt, %141, %c32 : index
scf.if %142 {
%187 = vector.extract %117[12, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %141, %c0] : memref<1x1x32x1xi32>
} else {
}
%143 = affine.apply affine_map<(d0) -> (d0 + 13)>(%48)
%144 = arith.cmpi slt, %143, %c32 : index
scf.if %144 {
%187 = vector.extract %117[13, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %143, %c0] : memref<1x1x32x1xi32>
} else {
}
%145 = affine.apply affine_map<(d0) -> (d0 + 14)>(%48)
%146 = arith.cmpi slt, %145, %c32 : index
scf.if %146 {
%187 = vector.extract %117[14, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %145, %c0] : memref<1x1x32x1xi32>
} else {
}
%147 = affine.apply affine_map<(d0) -> (d0 + 15)>(%48)
%148 = arith.cmpi slt, %147, %c32 : index
scf.if %148 {
%187 = vector.extract %117[15, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %147, %c0] : memref<1x1x32x1xi32>
} else {
}
%149 = affine.apply affine_map<(d0) -> (d0 + 16)>(%48)
%150 = arith.cmpi slt, %149, %c32 : index
scf.if %150 {
%187 = vector.extract %117[16, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %149, %c0] : memref<1x1x32x1xi32>
} else {
}
%151 = affine.apply affine_map<(d0) -> (d0 + 17)>(%48)
%152 = arith.cmpi slt, %151, %c32 : index
scf.if %152 {
%187 = vector.extract %117[17, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %151, %c0] : memref<1x1x32x1xi32>
} else {
}
%153 = affine.apply affine_map<(d0) -> (d0 + 18)>(%48)
%154 = arith.cmpi slt, %153, %c32 : index
scf.if %154 {
%187 = vector.extract %117[18, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %153, %c0] : memref<1x1x32x1xi32>
} else {
}
%155 = affine.apply affine_map<(d0) -> (d0 + 19)>(%48)
%156 = arith.cmpi slt, %155, %c32 : index
scf.if %156 {
%187 = vector.extract %117[19, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %155, %c0] : memref<1x1x32x1xi32>
} else {
}
%157 = affine.apply affine_map<(d0) -> (d0 + 20)>(%48)
%158 = arith.cmpi slt, %157, %c32 : index
scf.if %158 {
%187 = vector.extract %117[20, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %157, %c0] : memref<1x1x32x1xi32>
} else {
}
%159 = affine.apply affine_map<(d0) -> (d0 + 21)>(%48)
%160 = arith.cmpi slt, %159, %c32 : index
scf.if %160 {
%187 = vector.extract %117[21, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %159, %c0] : memref<1x1x32x1xi32>
} else {
}
%161 = affine.apply affine_map<(d0) -> (d0 + 22)>(%48)
%162 = arith.cmpi slt, %161, %c32 : index
scf.if %162 {
%187 = vector.extract %117[22, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %161, %c0] : memref<1x1x32x1xi32>
} else {
}
%163 = affine.apply affine_map<(d0) -> (d0 + 23)>(%48)
%164 = arith.cmpi slt, %163, %c32 : index
scf.if %164 {
%187 = vector.extract %117[23, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %163, %c0] : memref<1x1x32x1xi32>
} else {
}
%165 = affine.apply affine_map<(d0) -> (d0 + 24)>(%48)
%166 = arith.cmpi slt, %165, %c32 : index
scf.if %166 {
%187 = vector.extract %117[24, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %165, %c0] : memref<1x1x32x1xi32>
} else {
}
%167 = affine.apply affine_map<(d0) -> (d0 + 25)>(%48)
%168 = arith.cmpi slt, %167, %c32 : index
scf.if %168 {
%187 = vector.extract %117[25, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %167, %c0] : memref<1x1x32x1xi32>
} else {
}
%169 = affine.apply affine_map<(d0) -> (d0 + 26)>(%48)
%170 = arith.cmpi slt, %169, %c32 : index
scf.if %170 {
%187 = vector.extract %117[26, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %169, %c0] : memref<1x1x32x1xi32>
} else {
}
%171 = affine.apply affine_map<(d0) -> (d0 + 27)>(%48)
%172 = arith.cmpi slt, %171, %c32 : index
scf.if %172 {
%187 = vector.extract %117[27, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %171, %c0] : memref<1x1x32x1xi32>
} else {
}
%173 = affine.apply affine_map<(d0) -> (d0 + 28)>(%48)
%174 = arith.cmpi slt, %173, %c32 : index
scf.if %174 {
%187 = vector.extract %117[28, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %173, %c0] : memref<1x1x32x1xi32>
} else {
}
%175 = affine.apply affine_map<(d0) -> (d0 + 29)>(%48)
%176 = arith.cmpi slt, %175, %c32 : index
scf.if %176 {
%187 = vector.extract %117[29, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %175, %c0] : memref<1x1x32x1xi32>
} else {
}
%177 = affine.apply affine_map<(d0) -> (d0 + 30)>(%48)
%178 = arith.cmpi slt, %177, %c32 : index
scf.if %178 {
%187 = vector.extract %117[30, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %177, %c0] : memref<1x1x32x1xi32>
} else {
}
%179 = affine.apply affine_map<(d0) -> (d0 + 31)>(%48)
%180 = arith.cmpi slt, %179, %c32 : index
scf.if %180 {
%187 = vector.extract %117[31, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %179, %c0] : memref<1x1x32x1xi32>
} else {
}
%181 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%extracted = tensor.extract %cst[%c0, %arg2, %c0] : tensor<1x43x1xi32>
%182 = vector.broadcast %extracted : i32 to vector<1xi32>
%183 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%184 = vector.shuffle %182, %182 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%185 = arith.muli %181, %184 : vector<32xi32>
%186 = arith.addi %185, %183 : vector<32xi32>
vector.store %186, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
}
}
}
return
}
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- //
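// Annotation: this pass appears to only manage the internal strategy/transform
// markers that drive the staged lowering; the visible prefix of the IR below is
// structurally identical to the post-CSE output above.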
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c5 = arith.constant 5 : index
%c6 = arith.constant 6 : index
%c7 = arith.constant 7 : index
%c8 = arith.constant 8 : index
%c9 = arith.constant 9 : index
%c10 = arith.constant 10 : index
%c11 = arith.constant 11 : index
%c12 = arith.constant 12 : index
%c13 = arith.constant 13 : index
%c14 = arith.constant 14 : index
%c15 = arith.constant 15 : index
%c16 = arith.constant 16 : index
%c17 = arith.constant 17 : index
%c18 = arith.constant 18 : index
%c19 = arith.constant 19 : index
%c20 = arith.constant 20 : index
%c21 = arith.constant 21 : index
%c22 = arith.constant 22 : index
%c23 = arith.constant 23 : index
%c24 = arith.constant 24 : index
%c25 = arith.constant 25 : index
%c26 = arith.constant 26 : index
%c27 = arith.constant 27 : index
%c28 = arith.constant 28 : index
%c29 = arith.constant 29 : index
%c30 = arith.constant 30 : index
%c31 = arith.constant 31 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %0, 64 : memref<1x1080x1920x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %1[0, %2, %3, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
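  // NOTE: each workgroup owns a 60x64 output tile; %2/%3 are its row/column
  // origins (1080/60 = 18 tiles in y, 1920/64 = 30 in x, matching the
  // 30x18x1 workgroup grid declared in the export op).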
%4 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%5 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (1920, s0)>()[%5]
%7 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%8 = affine.min affine_map<()[s0] -> (1920, s0)>()[%7]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%8, %6]
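  // NOTE: %4-%9 clamp this workgroup's input window to the row bounds
  // [0, 1920). A 64-wide output tile under the 43-tap filter reads a
  // 64 + 2*21 = 106-wide span [x*64 - 21, x*64 + 85); %4 is the implicit
  // zero padding on the left and %9 the in-bounds width actually loaded.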
%subview_1 = memref.subview %0[0, %2, %6, 0] [1, 60, %9, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
%10 = vector.extract %cst_0[0, 0] : vector<32x1xi32>
%11 = vector.extract %cst_0[1, 0] : vector<32x1xi32>
%12 = vector.extract %cst_0[2, 0] : vector<32x1xi32>
%13 = vector.extract %cst_0[3, 0] : vector<32x1xi32>
%14 = vector.extract %cst_0[4, 0] : vector<32x1xi32>
%15 = vector.extract %cst_0[5, 0] : vector<32x1xi32>
%16 = vector.extract %cst_0[6, 0] : vector<32x1xi32>
%17 = vector.extract %cst_0[7, 0] : vector<32x1xi32>
%18 = vector.extract %cst_0[8, 0] : vector<32x1xi32>
%19 = vector.extract %cst_0[9, 0] : vector<32x1xi32>
%20 = vector.extract %cst_0[10, 0] : vector<32x1xi32>
%21 = vector.extract %cst_0[11, 0] : vector<32x1xi32>
%22 = vector.extract %cst_0[12, 0] : vector<32x1xi32>
%23 = vector.extract %cst_0[13, 0] : vector<32x1xi32>
%24 = vector.extract %cst_0[14, 0] : vector<32x1xi32>
%25 = vector.extract %cst_0[15, 0] : vector<32x1xi32>
%26 = vector.extract %cst_0[16, 0] : vector<32x1xi32>
%27 = vector.extract %cst_0[17, 0] : vector<32x1xi32>
%28 = vector.extract %cst_0[18, 0] : vector<32x1xi32>
%29 = vector.extract %cst_0[19, 0] : vector<32x1xi32>
%30 = vector.extract %cst_0[20, 0] : vector<32x1xi32>
%31 = vector.extract %cst_0[21, 0] : vector<32x1xi32>
%32 = vector.extract %cst_0[22, 0] : vector<32x1xi32>
%33 = vector.extract %cst_0[23, 0] : vector<32x1xi32>
%34 = vector.extract %cst_0[24, 0] : vector<32x1xi32>
%35 = vector.extract %cst_0[25, 0] : vector<32x1xi32>
%36 = vector.extract %cst_0[26, 0] : vector<32x1xi32>
%37 = vector.extract %cst_0[27, 0] : vector<32x1xi32>
%38 = vector.extract %cst_0[28, 0] : vector<32x1xi32>
%39 = vector.extract %cst_0[29, 0] : vector<32x1xi32>
%40 = vector.extract %cst_0[30, 0] : vector<32x1xi32>
%41 = vector.extract %cst_0[31, 0] : vector<32x1xi32>
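  // NOTE: %10-%41 are 32 extracts from the zero splat %cst_0, i.e. 32
  // copies of the i32 constant 0, used below to zero-initialize both the
  // output tile and the stack padding buffer %alloca.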
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%42 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%4]
%43 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%4]
%44 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%43)[%8, %6]
%45 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%4]
%46 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%45)[%8, %6]
%47 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%46, %44)
%subview_3 = memref.subview %subview_1[0, %arg0, %44, 0] [1, 1, %47, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %10, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %11, %subview_4[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %12, %subview_4[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %13, %subview_4[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %14, %subview_4[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %15, %subview_4[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %16, %subview_4[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %17, %subview_4[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %18, %subview_4[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %19, %subview_4[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %20, %subview_4[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %21, %subview_4[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %22, %subview_4[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %23, %subview_4[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %24, %subview_4[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %25, %subview_4[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %26, %subview_4[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %27, %subview_4[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %28, %subview_4[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %29, %subview_4[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %30, %subview_4[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %31, %subview_4[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %32, %subview_4[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %33, %subview_4[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %34, %subview_4[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %35, %subview_4[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %36, %subview_4[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %37, %subview_4[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %38, %subview_4[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %39, %subview_4[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %40, %subview_4[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %41, %subview_4[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
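      // NOTE: the loop below walks the 43 filter taps. For each tap %arg2
      // it computes which part of the in-bounds input %subview_3 is
      // readable: a window of dynamic size %53 starting at source offset
      // %50, to be placed at destination offset %48 of the padded buffer.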
scf.for %arg2 = %c0 to %c43 step %c1 {
%48 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%42, %arg2)
%49 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %42)
%50 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%49, %46, %44)
%51 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %42)
%52 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%51, %46, %44)
%53 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%52, %50)
%subview_7 = memref.subview %subview_3[0, 0, %50, 0] [1, 1, %53, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %10, %alloca[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32>
memref.store %11, %alloca[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32>
memref.store %12, %alloca[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32>
memref.store %13, %alloca[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32>
memref.store %14, %alloca[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32>
memref.store %15, %alloca[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32>
memref.store %16, %alloca[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32>
memref.store %17, %alloca[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32>
memref.store %18, %alloca[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32>
memref.store %19, %alloca[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32>
memref.store %20, %alloca[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32>
memref.store %21, %alloca[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32>
memref.store %22, %alloca[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32>
memref.store %23, %alloca[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32>
memref.store %24, %alloca[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32>
memref.store %25, %alloca[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32>
memref.store %26, %alloca[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32>
memref.store %27, %alloca[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32>
memref.store %28, %alloca[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32>
memref.store %29, %alloca[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32>
memref.store %30, %alloca[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32>
memref.store %31, %alloca[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32>
memref.store %32, %alloca[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32>
memref.store %33, %alloca[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32>
memref.store %34, %alloca[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32>
memref.store %35, %alloca[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32>
memref.store %36, %alloca[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32>
memref.store %37, %alloca[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32>
memref.store %38, %alloca[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32>
memref.store %39, %alloca[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32>
memref.store %40, %alloca[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32>
memref.store %41, %alloca[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32>
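        // NOTE: the chain of 32 guarded loads below emulates a masked
        // vector read from the dynamically sized window %subview_7: lane i
        // is loaded only when i < %53; otherwise the zero lane from %cst_0
        // is kept, supplying the implicit zero padding.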
%54 = arith.cmpi sgt, %53, %c0 : index
%55 = scf.if %54 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c0, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %cst_0 [0] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %cst_0 : vector<32x1xi32>
}
%56 = arith.cmpi sgt, %53, %c1 : index
%57 = scf.if %56 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c1, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %55 [1] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %55 : vector<32x1xi32>
}
%58 = arith.cmpi sgt, %53, %c2 : index
%59 = scf.if %58 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c2, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %57 [2] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %57 : vector<32x1xi32>
}
%60 = arith.cmpi sgt, %53, %c3 : index
%61 = scf.if %60 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c3, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %59 [3] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %59 : vector<32x1xi32>
}
%62 = arith.cmpi sgt, %53, %c4 : index
%63 = scf.if %62 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c4, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %61 [4] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %61 : vector<32x1xi32>
}
%64 = arith.cmpi sgt, %53, %c5 : index
%65 = scf.if %64 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c5, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %63 [5] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %63 : vector<32x1xi32>
}
%66 = arith.cmpi sgt, %53, %c6 : index
%67 = scf.if %66 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c6, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %65 [6] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %65 : vector<32x1xi32>
}
%68 = arith.cmpi sgt, %53, %c7 : index
%69 = scf.if %68 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c7, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %67 [7] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %67 : vector<32x1xi32>
}
%70 = arith.cmpi sgt, %53, %c8 : index
%71 = scf.if %70 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c8, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %69 [8] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %69 : vector<32x1xi32>
}
%72 = arith.cmpi sgt, %53, %c9 : index
%73 = scf.if %72 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c9, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %71 [9] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %71 : vector<32x1xi32>
}
%74 = arith.cmpi sgt, %53, %c10 : index
%75 = scf.if %74 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c10, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %73 [10] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %73 : vector<32x1xi32>
}
%76 = arith.cmpi sgt, %53, %c11 : index
%77 = scf.if %76 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c11, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %75 [11] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %75 : vector<32x1xi32>
}
%78 = arith.cmpi sgt, %53, %c12 : index
%79 = scf.if %78 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c12, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %77 [12] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %77 : vector<32x1xi32>
}
%80 = arith.cmpi sgt, %53, %c13 : index
%81 = scf.if %80 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c13, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %79 [13] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %79 : vector<32x1xi32>
}
%82 = arith.cmpi sgt, %53, %c14 : index
%83 = scf.if %82 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c14, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %81 [14] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %81 : vector<32x1xi32>
}
%84 = arith.cmpi sgt, %53, %c15 : index
%85 = scf.if %84 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c15, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %83 [15] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %83 : vector<32x1xi32>
}
%86 = arith.cmpi sgt, %53, %c16 : index
%87 = scf.if %86 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c16, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %85 [16] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %85 : vector<32x1xi32>
}
%88 = arith.cmpi sgt, %53, %c17 : index
%89 = scf.if %88 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c17, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %87 [17] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %87 : vector<32x1xi32>
}
%90 = arith.cmpi sgt, %53, %c18 : index
%91 = scf.if %90 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c18, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %89 [18] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %89 : vector<32x1xi32>
}
%92 = arith.cmpi sgt, %53, %c19 : index
%93 = scf.if %92 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c19, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %91 [19] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %91 : vector<32x1xi32>
}
%94 = arith.cmpi sgt, %53, %c20 : index
%95 = scf.if %94 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c20, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %93 [20] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %93 : vector<32x1xi32>
}
%96 = arith.cmpi sgt, %53, %c21 : index
%97 = scf.if %96 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c21, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %95 [21] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %95 : vector<32x1xi32>
}
%98 = arith.cmpi sgt, %53, %c22 : index
%99 = scf.if %98 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c22, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %97 [22] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %97 : vector<32x1xi32>
}
%100 = arith.cmpi sgt, %53, %c23 : index
%101 = scf.if %100 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c23, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %99 [23] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %99 : vector<32x1xi32>
}
%102 = arith.cmpi sgt, %53, %c24 : index
%103 = scf.if %102 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c24, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %101 [24] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %101 : vector<32x1xi32>
}
%104 = arith.cmpi sgt, %53, %c25 : index
%105 = scf.if %104 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c25, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %103 [25] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %103 : vector<32x1xi32>
}
%106 = arith.cmpi sgt, %53, %c26 : index
%107 = scf.if %106 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c26, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %105 [26] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %105 : vector<32x1xi32>
}
%108 = arith.cmpi sgt, %53, %c27 : index
%109 = scf.if %108 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c27, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %107 [27] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %107 : vector<32x1xi32>
}
%110 = arith.cmpi sgt, %53, %c28 : index
%111 = scf.if %110 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c28, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %109 [28] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %109 : vector<32x1xi32>
}
%112 = arith.cmpi sgt, %53, %c29 : index
%113 = scf.if %112 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c29, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %111 [29] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %111 : vector<32x1xi32>
}
%114 = arith.cmpi sgt, %53, %c30 : index
%115 = scf.if %114 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c30, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %113 [30] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %113 : vector<32x1xi32>
}
%116 = arith.cmpi sgt, %53, %c31 : index
%117 = scf.if %116 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c31, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %115 [31] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %115 : vector<32x1xi32>
}
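        // NOTE: symmetrically, the 32 guarded stores below scatter the
        // gathered lanes into %alloca starting at offset %48, materializing
        // a contiguous, zero-padded 32-element input window for this tap.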
%118 = arith.cmpi slt, %48, %c32 : index
scf.if %118 {
%187 = vector.extract %117[0, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %48, %c0] : memref<1x1x32x1xi32>
} else {
}
%119 = affine.apply affine_map<(d0) -> (d0 + 1)>(%48)
%120 = arith.cmpi slt, %119, %c32 : index
scf.if %120 {
%187 = vector.extract %117[1, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %119, %c0] : memref<1x1x32x1xi32>
} else {
}
%121 = affine.apply affine_map<(d0) -> (d0 + 2)>(%48)
%122 = arith.cmpi slt, %121, %c32 : index
scf.if %122 {
%187 = vector.extract %117[2, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %121, %c0] : memref<1x1x32x1xi32>
} else {
}
%123 = affine.apply affine_map<(d0) -> (d0 + 3)>(%48)
%124 = arith.cmpi slt, %123, %c32 : index
scf.if %124 {
%187 = vector.extract %117[3, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %123, %c0] : memref<1x1x32x1xi32>
} else {
}
%125 = affine.apply affine_map<(d0) -> (d0 + 4)>(%48)
%126 = arith.cmpi slt, %125, %c32 : index
scf.if %126 {
%187 = vector.extract %117[4, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %125, %c0] : memref<1x1x32x1xi32>
} else {
}
%127 = affine.apply affine_map<(d0) -> (d0 + 5)>(%48)
%128 = arith.cmpi slt, %127, %c32 : index
scf.if %128 {
%187 = vector.extract %117[5, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %127, %c0] : memref<1x1x32x1xi32>
} else {
}
%129 = affine.apply affine_map<(d0) -> (d0 + 6)>(%48)
%130 = arith.cmpi slt, %129, %c32 : index
scf.if %130 {
%187 = vector.extract %117[6, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %129, %c0] : memref<1x1x32x1xi32>
} else {
}
%131 = affine.apply affine_map<(d0) -> (d0 + 7)>(%48)
%132 = arith.cmpi slt, %131, %c32 : index
scf.if %132 {
%187 = vector.extract %117[7, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %131, %c0] : memref<1x1x32x1xi32>
} else {
}
%133 = affine.apply affine_map<(d0) -> (d0 + 8)>(%48)
%134 = arith.cmpi slt, %133, %c32 : index
scf.if %134 {
%187 = vector.extract %117[8, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %133, %c0] : memref<1x1x32x1xi32>
} else {
}
%135 = affine.apply affine_map<(d0) -> (d0 + 9)>(%48)
%136 = arith.cmpi slt, %135, %c32 : index
scf.if %136 {
%187 = vector.extract %117[9, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %135, %c0] : memref<1x1x32x1xi32>
} else {
}
%137 = affine.apply affine_map<(d0) -> (d0 + 10)>(%48)
%138 = arith.cmpi slt, %137, %c32 : index
scf.if %138 {
%187 = vector.extract %117[10, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %137, %c0] : memref<1x1x32x1xi32>
} else {
}
%139 = affine.apply affine_map<(d0) -> (d0 + 11)>(%48)
%140 = arith.cmpi slt, %139, %c32 : index
scf.if %140 {
%187 = vector.extract %117[11, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %139, %c0] : memref<1x1x32x1xi32>
} else {
}
%141 = affine.apply affine_map<(d0) -> (d0 + 12)>(%48)
%142 = arith.cmpi slt, %141, %c32 : index
scf.if %142 {
%187 = vector.extract %117[12, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %141, %c0] : memref<1x1x32x1xi32>
} else {
}
%143 = affine.apply affine_map<(d0) -> (d0 + 13)>(%48)
%144 = arith.cmpi slt, %143, %c32 : index
scf.if %144 {
%187 = vector.extract %117[13, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %143, %c0] : memref<1x1x32x1xi32>
} else {
}
%145 = affine.apply affine_map<(d0) -> (d0 + 14)>(%48)
%146 = arith.cmpi slt, %145, %c32 : index
scf.if %146 {
%187 = vector.extract %117[14, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %145, %c0] : memref<1x1x32x1xi32>
} else {
}
%147 = affine.apply affine_map<(d0) -> (d0 + 15)>(%48)
%148 = arith.cmpi slt, %147, %c32 : index
scf.if %148 {
%187 = vector.extract %117[15, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %147, %c0] : memref<1x1x32x1xi32>
} else {
}
%149 = affine.apply affine_map<(d0) -> (d0 + 16)>(%48)
%150 = arith.cmpi slt, %149, %c32 : index
scf.if %150 {
%187 = vector.extract %117[16, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %149, %c0] : memref<1x1x32x1xi32>
} else {
}
%151 = affine.apply affine_map<(d0) -> (d0 + 17)>(%48)
%152 = arith.cmpi slt, %151, %c32 : index
scf.if %152 {
%187 = vector.extract %117[17, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %151, %c0] : memref<1x1x32x1xi32>
} else {
}
%153 = affine.apply affine_map<(d0) -> (d0 + 18)>(%48)
%154 = arith.cmpi slt, %153, %c32 : index
scf.if %154 {
%187 = vector.extract %117[18, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %153, %c0] : memref<1x1x32x1xi32>
} else {
}
%155 = affine.apply affine_map<(d0) -> (d0 + 19)>(%48)
%156 = arith.cmpi slt, %155, %c32 : index
scf.if %156 {
%187 = vector.extract %117[19, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %155, %c0] : memref<1x1x32x1xi32>
} else {
}
%157 = affine.apply affine_map<(d0) -> (d0 + 20)>(%48)
%158 = arith.cmpi slt, %157, %c32 : index
scf.if %158 {
%187 = vector.extract %117[20, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %157, %c0] : memref<1x1x32x1xi32>
} else {
}
%159 = affine.apply affine_map<(d0) -> (d0 + 21)>(%48)
%160 = arith.cmpi slt, %159, %c32 : index
scf.if %160 {
%187 = vector.extract %117[21, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %159, %c0] : memref<1x1x32x1xi32>
} else {
}
%161 = affine.apply affine_map<(d0) -> (d0 + 22)>(%48)
%162 = arith.cmpi slt, %161, %c32 : index
scf.if %162 {
%187 = vector.extract %117[22, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %161, %c0] : memref<1x1x32x1xi32>
} else {
}
%163 = affine.apply affine_map<(d0) -> (d0 + 23)>(%48)
%164 = arith.cmpi slt, %163, %c32 : index
scf.if %164 {
%187 = vector.extract %117[23, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %163, %c0] : memref<1x1x32x1xi32>
} else {
}
%165 = affine.apply affine_map<(d0) -> (d0 + 24)>(%48)
%166 = arith.cmpi slt, %165, %c32 : index
scf.if %166 {
%187 = vector.extract %117[24, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %165, %c0] : memref<1x1x32x1xi32>
} else {
}
%167 = affine.apply affine_map<(d0) -> (d0 + 25)>(%48)
%168 = arith.cmpi slt, %167, %c32 : index
scf.if %168 {
%187 = vector.extract %117[25, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %167, %c0] : memref<1x1x32x1xi32>
} else {
}
%169 = affine.apply affine_map<(d0) -> (d0 + 26)>(%48)
%170 = arith.cmpi slt, %169, %c32 : index
scf.if %170 {
%187 = vector.extract %117[26, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %169, %c0] : memref<1x1x32x1xi32>
} else {
}
%171 = affine.apply affine_map<(d0) -> (d0 + 27)>(%48)
%172 = arith.cmpi slt, %171, %c32 : index
scf.if %172 {
%187 = vector.extract %117[27, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %171, %c0] : memref<1x1x32x1xi32>
} else {
}
%173 = affine.apply affine_map<(d0) -> (d0 + 28)>(%48)
%174 = arith.cmpi slt, %173, %c32 : index
scf.if %174 {
%187 = vector.extract %117[28, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %173, %c0] : memref<1x1x32x1xi32>
} else {
}
%175 = affine.apply affine_map<(d0) -> (d0 + 29)>(%48)
%176 = arith.cmpi slt, %175, %c32 : index
scf.if %176 {
%187 = vector.extract %117[29, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %175, %c0] : memref<1x1x32x1xi32>
} else {
}
%177 = affine.apply affine_map<(d0) -> (d0 + 30)>(%48)
%178 = arith.cmpi slt, %177, %c32 : index
scf.if %178 {
%187 = vector.extract %117[30, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %177, %c0] : memref<1x1x32x1xi32>
} else {
}
%179 = affine.apply affine_map<(d0) -> (d0 + 31)>(%48)
%180 = arith.cmpi slt, %179, %c32 : index
scf.if %180 {
%187 = vector.extract %117[31, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %179, %c0] : memref<1x1x32x1xi32>
} else {
}
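        // NOTE: multiply-accumulate core of the depthwise convolution. In
        // effect (scalar sketch; in_pad is the input row zero-padded by 21
        // on each side, w the 43 taps from %cst):
        //   for k in 0..42: out[x .. x+31] += in_pad[x + k .. x + k + 31] * w[k]
        // %181 reloads the padded window, %184 splats tap w[%arg2], and the
        // mul/add result %186 is stored back into the 32-wide output slice.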
%181 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%extracted = tensor.extract %cst[%c0, %arg2, %c0] : tensor<1x43x1xi32>
%182 = vector.broadcast %extracted : i32 to vector<1xi32>
%183 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%184 = vector.shuffle %182, %182 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%185 = arith.muli %181, %184 : vector<32xi32>
%186 = arith.addi %185, %183 : vector<32xi32>
vector.store %186, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
}
}
}
return
}
// -----// IR Dump After LinalgVectorLowering (linalg-vector-lowering) //----- //
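// NOTE: the dump below repeats the previous function unchanged: the vector
// transfers were already scalarized into the scf.if ladders above, leaving
// this vector-lowering stage nothing to rewrite. The annotations on the
// previous dump apply here as well.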
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c5 = arith.constant 5 : index
%c6 = arith.constant 6 : index
%c7 = arith.constant 7 : index
%c8 = arith.constant 8 : index
%c9 = arith.constant 9 : index
%c10 = arith.constant 10 : index
%c11 = arith.constant 11 : index
%c12 = arith.constant 12 : index
%c13 = arith.constant 13 : index
%c14 = arith.constant 14 : index
%c15 = arith.constant 15 : index
%c16 = arith.constant 16 : index
%c17 = arith.constant 17 : index
%c18 = arith.constant 18 : index
%c19 = arith.constant 19 : index
%c20 = arith.constant 20 : index
%c21 = arith.constant 21 : index
%c22 = arith.constant 22 : index
%c23 = arith.constant 23 : index
%c24 = arith.constant 24 : index
%c25 = arith.constant 25 : index
%c26 = arith.constant 26 : index
%c27 = arith.constant 27 : index
%c28 = arith.constant 28 : index
%c29 = arith.constant 29 : index
%c30 = arith.constant 30 : index
%c31 = arith.constant 31 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %0, 64 : memref<1x1080x1920x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %1[0, %2, %3, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%4 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%5 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (1920, s0)>()[%5]
%7 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%8 = affine.min affine_map<()[s0] -> (1920, s0)>()[%7]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%8, %6]
%subview_1 = memref.subview %0[0, %2, %6, 0] [1, 60, %9, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
%10 = vector.extract %cst_0[0, 0] : vector<32x1xi32>
%11 = vector.extract %cst_0[1, 0] : vector<32x1xi32>
%12 = vector.extract %cst_0[2, 0] : vector<32x1xi32>
%13 = vector.extract %cst_0[3, 0] : vector<32x1xi32>
%14 = vector.extract %cst_0[4, 0] : vector<32x1xi32>
%15 = vector.extract %cst_0[5, 0] : vector<32x1xi32>
%16 = vector.extract %cst_0[6, 0] : vector<32x1xi32>
%17 = vector.extract %cst_0[7, 0] : vector<32x1xi32>
%18 = vector.extract %cst_0[8, 0] : vector<32x1xi32>
%19 = vector.extract %cst_0[9, 0] : vector<32x1xi32>
%20 = vector.extract %cst_0[10, 0] : vector<32x1xi32>
%21 = vector.extract %cst_0[11, 0] : vector<32x1xi32>
%22 = vector.extract %cst_0[12, 0] : vector<32x1xi32>
%23 = vector.extract %cst_0[13, 0] : vector<32x1xi32>
%24 = vector.extract %cst_0[14, 0] : vector<32x1xi32>
%25 = vector.extract %cst_0[15, 0] : vector<32x1xi32>
%26 = vector.extract %cst_0[16, 0] : vector<32x1xi32>
%27 = vector.extract %cst_0[17, 0] : vector<32x1xi32>
%28 = vector.extract %cst_0[18, 0] : vector<32x1xi32>
%29 = vector.extract %cst_0[19, 0] : vector<32x1xi32>
%30 = vector.extract %cst_0[20, 0] : vector<32x1xi32>
%31 = vector.extract %cst_0[21, 0] : vector<32x1xi32>
%32 = vector.extract %cst_0[22, 0] : vector<32x1xi32>
%33 = vector.extract %cst_0[23, 0] : vector<32x1xi32>
%34 = vector.extract %cst_0[24, 0] : vector<32x1xi32>
%35 = vector.extract %cst_0[25, 0] : vector<32x1xi32>
%36 = vector.extract %cst_0[26, 0] : vector<32x1xi32>
%37 = vector.extract %cst_0[27, 0] : vector<32x1xi32>
%38 = vector.extract %cst_0[28, 0] : vector<32x1xi32>
%39 = vector.extract %cst_0[29, 0] : vector<32x1xi32>
%40 = vector.extract %cst_0[30, 0] : vector<32x1xi32>
%41 = vector.extract %cst_0[31, 0] : vector<32x1xi32>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%42 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%4]
%43 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%4]
%44 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%43)[%8, %6]
%45 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%4]
%46 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%45)[%8, %6]
%47 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%46, %44)
%subview_3 = memref.subview %subview_1[0, %arg0, %44, 0] [1, 1, %47, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %10, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %11, %subview_4[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %12, %subview_4[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %13, %subview_4[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %14, %subview_4[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %15, %subview_4[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %16, %subview_4[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %17, %subview_4[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %18, %subview_4[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %19, %subview_4[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %20, %subview_4[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %21, %subview_4[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %22, %subview_4[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %23, %subview_4[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %24, %subview_4[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %25, %subview_4[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %26, %subview_4[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %27, %subview_4[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %28, %subview_4[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %29, %subview_4[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %30, %subview_4[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %31, %subview_4[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %32, %subview_4[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %33, %subview_4[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %34, %subview_4[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %35, %subview_4[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %36, %subview_4[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %37, %subview_4[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %38, %subview_4[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %39, %subview_4[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %40, %subview_4[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %41, %subview_4[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%48 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%42, %arg2)
%49 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %42)
%50 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%49, %46, %44)
%51 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %42)
%52 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%51, %46, %44)
%53 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%52, %50)
%subview_7 = memref.subview %subview_3[0, 0, %50, 0] [1, 1, %53, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %10, %alloca[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32>
memref.store %11, %alloca[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32>
memref.store %12, %alloca[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32>
memref.store %13, %alloca[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32>
memref.store %14, %alloca[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32>
memref.store %15, %alloca[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32>
memref.store %16, %alloca[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32>
memref.store %17, %alloca[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32>
memref.store %18, %alloca[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32>
memref.store %19, %alloca[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32>
memref.store %20, %alloca[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32>
memref.store %21, %alloca[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32>
memref.store %22, %alloca[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32>
memref.store %23, %alloca[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32>
memref.store %24, %alloca[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32>
memref.store %25, %alloca[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32>
memref.store %26, %alloca[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32>
memref.store %27, %alloca[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32>
memref.store %28, %alloca[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32>
memref.store %29, %alloca[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32>
memref.store %30, %alloca[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32>
memref.store %31, %alloca[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32>
memref.store %32, %alloca[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32>
memref.store %33, %alloca[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32>
memref.store %34, %alloca[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32>
memref.store %35, %alloca[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32>
memref.store %36, %alloca[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32>
memref.store %37, %alloca[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32>
memref.store %38, %alloca[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32>
memref.store %39, %alloca[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32>
memref.store %40, %alloca[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32>
memref.store %41, %alloca[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32>
%54 = arith.cmpi sgt, %53, %c0 : index
%55 = scf.if %54 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c0, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %cst_0 [0] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %cst_0 : vector<32x1xi32>
}
%56 = arith.cmpi sgt, %53, %c1 : index
%57 = scf.if %56 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c1, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %55 [1] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %55 : vector<32x1xi32>
}
%58 = arith.cmpi sgt, %53, %c2 : index
%59 = scf.if %58 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c2, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %57 [2] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %57 : vector<32x1xi32>
}
%60 = arith.cmpi sgt, %53, %c3 : index
%61 = scf.if %60 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c3, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %59 [3] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %59 : vector<32x1xi32>
}
%62 = arith.cmpi sgt, %53, %c4 : index
%63 = scf.if %62 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c4, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %61 [4] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %61 : vector<32x1xi32>
}
%64 = arith.cmpi sgt, %53, %c5 : index
%65 = scf.if %64 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c5, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %63 [5] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %63 : vector<32x1xi32>
}
%66 = arith.cmpi sgt, %53, %c6 : index
%67 = scf.if %66 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c6, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %65 [6] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %65 : vector<32x1xi32>
}
%68 = arith.cmpi sgt, %53, %c7 : index
%69 = scf.if %68 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c7, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %67 [7] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %67 : vector<32x1xi32>
}
%70 = arith.cmpi sgt, %53, %c8 : index
%71 = scf.if %70 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c8, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %69 [8] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %69 : vector<32x1xi32>
}
%72 = arith.cmpi sgt, %53, %c9 : index
%73 = scf.if %72 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c9, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %71 [9] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %71 : vector<32x1xi32>
}
%74 = arith.cmpi sgt, %53, %c10 : index
%75 = scf.if %74 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c10, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %73 [10] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %73 : vector<32x1xi32>
}
%76 = arith.cmpi sgt, %53, %c11 : index
%77 = scf.if %76 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c11, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %75 [11] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %75 : vector<32x1xi32>
}
%78 = arith.cmpi sgt, %53, %c12 : index
%79 = scf.if %78 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c12, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %77 [12] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %77 : vector<32x1xi32>
}
%80 = arith.cmpi sgt, %53, %c13 : index
%81 = scf.if %80 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c13, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %79 [13] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %79 : vector<32x1xi32>
}
%82 = arith.cmpi sgt, %53, %c14 : index
%83 = scf.if %82 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c14, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %81 [14] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %81 : vector<32x1xi32>
}
%84 = arith.cmpi sgt, %53, %c15 : index
%85 = scf.if %84 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c15, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %83 [15] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %83 : vector<32x1xi32>
}
%86 = arith.cmpi sgt, %53, %c16 : index
%87 = scf.if %86 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c16, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %85 [16] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %85 : vector<32x1xi32>
}
%88 = arith.cmpi sgt, %53, %c17 : index
%89 = scf.if %88 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c17, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %87 [17] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %87 : vector<32x1xi32>
}
%90 = arith.cmpi sgt, %53, %c18 : index
%91 = scf.if %90 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c18, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %89 [18] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %89 : vector<32x1xi32>
}
%92 = arith.cmpi sgt, %53, %c19 : index
%93 = scf.if %92 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c19, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %91 [19] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %91 : vector<32x1xi32>
}
%94 = arith.cmpi sgt, %53, %c20 : index
%95 = scf.if %94 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c20, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %93 [20] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %93 : vector<32x1xi32>
}
%96 = arith.cmpi sgt, %53, %c21 : index
%97 = scf.if %96 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c21, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %95 [21] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %95 : vector<32x1xi32>
}
%98 = arith.cmpi sgt, %53, %c22 : index
%99 = scf.if %98 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c22, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %97 [22] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %97 : vector<32x1xi32>
}
%100 = arith.cmpi sgt, %53, %c23 : index
%101 = scf.if %100 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c23, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %99 [23] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %99 : vector<32x1xi32>
}
%102 = arith.cmpi sgt, %53, %c24 : index
%103 = scf.if %102 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c24, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %101 [24] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %101 : vector<32x1xi32>
}
%104 = arith.cmpi sgt, %53, %c25 : index
%105 = scf.if %104 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c25, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %103 [25] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %103 : vector<32x1xi32>
}
%106 = arith.cmpi sgt, %53, %c26 : index
%107 = scf.if %106 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c26, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %105 [26] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %105 : vector<32x1xi32>
}
%108 = arith.cmpi sgt, %53, %c27 : index
%109 = scf.if %108 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c27, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %107 [27] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %107 : vector<32x1xi32>
}
%110 = arith.cmpi sgt, %53, %c28 : index
%111 = scf.if %110 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c28, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %109 [28] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %109 : vector<32x1xi32>
}
%112 = arith.cmpi sgt, %53, %c29 : index
%113 = scf.if %112 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c29, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %111 [29] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %111 : vector<32x1xi32>
}
%114 = arith.cmpi sgt, %53, %c30 : index
%115 = scf.if %114 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c30, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %113 [30] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %113 : vector<32x1xi32>
}
%116 = arith.cmpi sgt, %53, %c31 : index
%117 = scf.if %116 -> (vector<32x1xi32>) {
%187 = memref.load %subview_7[%c0, %c0, %c31, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%188 = vector.broadcast %187 : i32 to vector<1xi32>
%189 = vector.insert %188, %115 [31] : vector<1xi32> into vector<32x1xi32>
scf.yield %189 : vector<32x1xi32>
} else {
scf.yield %115 : vector<32x1xi32>
}
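// Note: the scf.if ladder above gathers up to 32 i32 values from the
// dynamic-width slice %subview_7 into %117, filling lane k only when
// k < %53 (the in-bounds width). The guarded stores below scatter those
// lanes into the staging buffer %alloca at offset %48 + k, skipping any
// lane that would land past index 31 -- a scalarized masked read/write.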
%118 = arith.cmpi slt, %48, %c32 : index
scf.if %118 {
%187 = vector.extract %117[0, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %48, %c0] : memref<1x1x32x1xi32>
} else {
}
%119 = affine.apply affine_map<(d0) -> (d0 + 1)>(%48)
%120 = arith.cmpi slt, %119, %c32 : index
scf.if %120 {
%187 = vector.extract %117[1, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %119, %c0] : memref<1x1x32x1xi32>
} else {
}
%121 = affine.apply affine_map<(d0) -> (d0 + 2)>(%48)
%122 = arith.cmpi slt, %121, %c32 : index
scf.if %122 {
%187 = vector.extract %117[2, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %121, %c0] : memref<1x1x32x1xi32>
} else {
}
%123 = affine.apply affine_map<(d0) -> (d0 + 3)>(%48)
%124 = arith.cmpi slt, %123, %c32 : index
scf.if %124 {
%187 = vector.extract %117[3, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %123, %c0] : memref<1x1x32x1xi32>
} else {
}
%125 = affine.apply affine_map<(d0) -> (d0 + 4)>(%48)
%126 = arith.cmpi slt, %125, %c32 : index
scf.if %126 {
%187 = vector.extract %117[4, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %125, %c0] : memref<1x1x32x1xi32>
} else {
}
%127 = affine.apply affine_map<(d0) -> (d0 + 5)>(%48)
%128 = arith.cmpi slt, %127, %c32 : index
scf.if %128 {
%187 = vector.extract %117[5, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %127, %c0] : memref<1x1x32x1xi32>
} else {
}
%129 = affine.apply affine_map<(d0) -> (d0 + 6)>(%48)
%130 = arith.cmpi slt, %129, %c32 : index
scf.if %130 {
%187 = vector.extract %117[6, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %129, %c0] : memref<1x1x32x1xi32>
} else {
}
%131 = affine.apply affine_map<(d0) -> (d0 + 7)>(%48)
%132 = arith.cmpi slt, %131, %c32 : index
scf.if %132 {
%187 = vector.extract %117[7, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %131, %c0] : memref<1x1x32x1xi32>
} else {
}
%133 = affine.apply affine_map<(d0) -> (d0 + 8)>(%48)
%134 = arith.cmpi slt, %133, %c32 : index
scf.if %134 {
%187 = vector.extract %117[8, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %133, %c0] : memref<1x1x32x1xi32>
} else {
}
%135 = affine.apply affine_map<(d0) -> (d0 + 9)>(%48)
%136 = arith.cmpi slt, %135, %c32 : index
scf.if %136 {
%187 = vector.extract %117[9, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %135, %c0] : memref<1x1x32x1xi32>
} else {
}
%137 = affine.apply affine_map<(d0) -> (d0 + 10)>(%48)
%138 = arith.cmpi slt, %137, %c32 : index
scf.if %138 {
%187 = vector.extract %117[10, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %137, %c0] : memref<1x1x32x1xi32>
} else {
}
%139 = affine.apply affine_map<(d0) -> (d0 + 11)>(%48)
%140 = arith.cmpi slt, %139, %c32 : index
scf.if %140 {
%187 = vector.extract %117[11, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %139, %c0] : memref<1x1x32x1xi32>
} else {
}
%141 = affine.apply affine_map<(d0) -> (d0 + 12)>(%48)
%142 = arith.cmpi slt, %141, %c32 : index
scf.if %142 {
%187 = vector.extract %117[12, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %141, %c0] : memref<1x1x32x1xi32>
} else {
}
%143 = affine.apply affine_map<(d0) -> (d0 + 13)>(%48)
%144 = arith.cmpi slt, %143, %c32 : index
scf.if %144 {
%187 = vector.extract %117[13, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %143, %c0] : memref<1x1x32x1xi32>
} else {
}
%145 = affine.apply affine_map<(d0) -> (d0 + 14)>(%48)
%146 = arith.cmpi slt, %145, %c32 : index
scf.if %146 {
%187 = vector.extract %117[14, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %145, %c0] : memref<1x1x32x1xi32>
} else {
}
%147 = affine.apply affine_map<(d0) -> (d0 + 15)>(%48)
%148 = arith.cmpi slt, %147, %c32 : index
scf.if %148 {
%187 = vector.extract %117[15, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %147, %c0] : memref<1x1x32x1xi32>
} else {
}
%149 = affine.apply affine_map<(d0) -> (d0 + 16)>(%48)
%150 = arith.cmpi slt, %149, %c32 : index
scf.if %150 {
%187 = vector.extract %117[16, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %149, %c0] : memref<1x1x32x1xi32>
} else {
}
%151 = affine.apply affine_map<(d0) -> (d0 + 17)>(%48)
%152 = arith.cmpi slt, %151, %c32 : index
scf.if %152 {
%187 = vector.extract %117[17, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %151, %c0] : memref<1x1x32x1xi32>
} else {
}
%153 = affine.apply affine_map<(d0) -> (d0 + 18)>(%48)
%154 = arith.cmpi slt, %153, %c32 : index
scf.if %154 {
%187 = vector.extract %117[18, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %153, %c0] : memref<1x1x32x1xi32>
} else {
}
%155 = affine.apply affine_map<(d0) -> (d0 + 19)>(%48)
%156 = arith.cmpi slt, %155, %c32 : index
scf.if %156 {
%187 = vector.extract %117[19, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %155, %c0] : memref<1x1x32x1xi32>
} else {
}
%157 = affine.apply affine_map<(d0) -> (d0 + 20)>(%48)
%158 = arith.cmpi slt, %157, %c32 : index
scf.if %158 {
%187 = vector.extract %117[20, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %157, %c0] : memref<1x1x32x1xi32>
} else {
}
%159 = affine.apply affine_map<(d0) -> (d0 + 21)>(%48)
%160 = arith.cmpi slt, %159, %c32 : index
scf.if %160 {
%187 = vector.extract %117[21, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %159, %c0] : memref<1x1x32x1xi32>
} else {
}
%161 = affine.apply affine_map<(d0) -> (d0 + 22)>(%48)
%162 = arith.cmpi slt, %161, %c32 : index
scf.if %162 {
%187 = vector.extract %117[22, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %161, %c0] : memref<1x1x32x1xi32>
} else {
}
%163 = affine.apply affine_map<(d0) -> (d0 + 23)>(%48)
%164 = arith.cmpi slt, %163, %c32 : index
scf.if %164 {
%187 = vector.extract %117[23, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %163, %c0] : memref<1x1x32x1xi32>
} else {
}
%165 = affine.apply affine_map<(d0) -> (d0 + 24)>(%48)
%166 = arith.cmpi slt, %165, %c32 : index
scf.if %166 {
%187 = vector.extract %117[24, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %165, %c0] : memref<1x1x32x1xi32>
} else {
}
%167 = affine.apply affine_map<(d0) -> (d0 + 25)>(%48)
%168 = arith.cmpi slt, %167, %c32 : index
scf.if %168 {
%187 = vector.extract %117[25, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %167, %c0] : memref<1x1x32x1xi32>
} else {
}
%169 = affine.apply affine_map<(d0) -> (d0 + 26)>(%48)
%170 = arith.cmpi slt, %169, %c32 : index
scf.if %170 {
%187 = vector.extract %117[26, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %169, %c0] : memref<1x1x32x1xi32>
} else {
}
%171 = affine.apply affine_map<(d0) -> (d0 + 27)>(%48)
%172 = arith.cmpi slt, %171, %c32 : index
scf.if %172 {
%187 = vector.extract %117[27, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %171, %c0] : memref<1x1x32x1xi32>
} else {
}
%173 = affine.apply affine_map<(d0) -> (d0 + 28)>(%48)
%174 = arith.cmpi slt, %173, %c32 : index
scf.if %174 {
%187 = vector.extract %117[28, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %173, %c0] : memref<1x1x32x1xi32>
} else {
}
%175 = affine.apply affine_map<(d0) -> (d0 + 29)>(%48)
%176 = arith.cmpi slt, %175, %c32 : index
scf.if %176 {
%187 = vector.extract %117[29, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %175, %c0] : memref<1x1x32x1xi32>
} else {
}
%177 = affine.apply affine_map<(d0) -> (d0 + 30)>(%48)
%178 = arith.cmpi slt, %177, %c32 : index
scf.if %178 {
%187 = vector.extract %117[30, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %177, %c0] : memref<1x1x32x1xi32>
} else {
}
%179 = affine.apply affine_map<(d0) -> (d0 + 31)>(%48)
%180 = arith.cmpi slt, %179, %c32 : index
scf.if %180 {
%187 = vector.extract %117[31, 0] : vector<32x1xi32>
memref.store %187, %alloca[%c0, %c0, %179, %c0] : memref<1x1x32x1xi32>
} else {
}
%181 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%extracted = tensor.extract %cst[%c0, %arg2, %c0] : tensor<1x43x1xi32>
%182 = vector.broadcast %extracted : i32 to vector<1xi32>
%183 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%184 = vector.shuffle %182, %182 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%185 = arith.muli %181, %184 : vector<32xi32>
%186 = arith.addi %185, %183 : vector<32xi32>
vector.store %186, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
}
}
}
return
}
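// For orientation: the per-lane scf.if ladders in the function above are the
// scalarized form of a masked 32-lane window read. Schematically (a
// hand-written sketch for illustration, not compiler output; %53 is the
// in-bounds width and %subview_7 the clipped input slice from above):
//   %mask = vector.create_mask %53, %c1 : vector<32x1xi1>
//   %win  = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32, %mask
//             {permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)>}
//             : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>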
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c5 = arith.constant 5 : index
%c6 = arith.constant 6 : index
%c7 = arith.constant 7 : index
%c8 = arith.constant 8 : index
%c9 = arith.constant 9 : index
%c10 = arith.constant 10 : index
%c11 = arith.constant 11 : index
%c12 = arith.constant 12 : index
%c13 = arith.constant 13 : index
%c14 = arith.constant 14 : index
%c15 = arith.constant 15 : index
%c16 = arith.constant 16 : index
%c17 = arith.constant 17 : index
%c18 = arith.constant 18 : index
%c19 = arith.constant 19 : index
%c20 = arith.constant 20 : index
%c21 = arith.constant 21 : index
%c22 = arith.constant 22 : index
%c23 = arith.constant 23 : index
%c24 = arith.constant 24 : index
%c25 = arith.constant 25 : index
%c26 = arith.constant 26 : index
%c27 = arith.constant 27 : index
%c28 = arith.constant 28 : index
%c29 = arith.constant 29 : index
%c30 = arith.constant 30 : index
%c31 = arith.constant 31 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %0, 64 : memref<1x1080x1920x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %1[0, %2, %3, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%4 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%5 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (1920, s0)>()[%5]
%7 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%8 = affine.min affine_map<()[s0] -> (1920, s0)>()[%7]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%8, %6]
%subview_1 = memref.subview %0[0, %2, %6, 0] [1, 60, %9, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%10 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%4]
%11 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%4]
%12 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%11)[%8, %6]
%13 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%4]
%14 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%13)[%8, %6]
%15 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%14, %12)
%subview_3 = memref.subview %subview_1[0, %arg0, %12, 0] [1, 1, %15, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
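// Zero-initialize the 1x32 output tile; the 43-tap loop below accumulates
// into it through the collapsed view %collapse_shape_6.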
memref.store %c0_i32, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
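// The collapse_shapes expose contiguous 1x32 views of the staging buffer
// (%collapse_shape) and of the current output row tile (%collapse_shape_6),
// so the tap loop can use single 32-wide vector.load/vector.store ops
// instead of 4-D transfers.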
scf.for %arg2 = %c0 to %c43 step %c1 {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%10, %arg2)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %10)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %14, %12)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %10)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %14, %12)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%subview_7 = memref.subview %subview_3[0, 0, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
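// For tap %arg2 the max/min chain above clips the shifted 32-wide window
// against the loaded input slice: %16 is the count of left-padding lanes,
// %18 and %20 the clipped start/end inside %subview_3, and %21 the in-bounds
// width that the guarded loads below may touch.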
memref.store %c0_i32, %alloca[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32>
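// %alloca is reset to zero on every tap so that lanes outside the clipped
// window keep a zero contribution, i.e. the implicit border padding of the
// depthwise convolution.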
%22 = arith.cmpi sgt, %21, %c0 : index
%23 = scf.if %22 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c0, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %cst_0 [0] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %cst_0 : vector<32x1xi32>
}
%24 = arith.cmpi sgt, %21, %c1 : index
%25 = scf.if %24 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c1, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %23 [1] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %23 : vector<32x1xi32>
}
%26 = arith.cmpi sgt, %21, %c2 : index
%27 = scf.if %26 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c2, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %25 [2] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %25 : vector<32x1xi32>
}
%28 = arith.cmpi sgt, %21, %c3 : index
%29 = scf.if %28 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c3, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %27 [3] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %27 : vector<32x1xi32>
}
%30 = arith.cmpi sgt, %21, %c4 : index
%31 = scf.if %30 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c4, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %29 [4] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %29 : vector<32x1xi32>
}
%32 = arith.cmpi sgt, %21, %c5 : index
%33 = scf.if %32 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c5, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %31 [5] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %31 : vector<32x1xi32>
}
%34 = arith.cmpi sgt, %21, %c6 : index
%35 = scf.if %34 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c6, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %33 [6] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %33 : vector<32x1xi32>
}
%36 = arith.cmpi sgt, %21, %c7 : index
%37 = scf.if %36 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c7, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %35 [7] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %35 : vector<32x1xi32>
}
%38 = arith.cmpi sgt, %21, %c8 : index
%39 = scf.if %38 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c8, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %37 [8] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %37 : vector<32x1xi32>
}
%40 = arith.cmpi sgt, %21, %c9 : index
%41 = scf.if %40 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c9, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %39 [9] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %39 : vector<32x1xi32>
}
%42 = arith.cmpi sgt, %21, %c10 : index
%43 = scf.if %42 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c10, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %41 [10] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %41 : vector<32x1xi32>
}
%44 = arith.cmpi sgt, %21, %c11 : index
%45 = scf.if %44 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c11, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %43 [11] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %43 : vector<32x1xi32>
}
%46 = arith.cmpi sgt, %21, %c12 : index
%47 = scf.if %46 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c12, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %45 [12] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %45 : vector<32x1xi32>
}
%48 = arith.cmpi sgt, %21, %c13 : index
%49 = scf.if %48 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c13, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %47 [13] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %47 : vector<32x1xi32>
}
%50 = arith.cmpi sgt, %21, %c14 : index
%51 = scf.if %50 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c14, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %49 [14] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %49 : vector<32x1xi32>
}
%52 = arith.cmpi sgt, %21, %c15 : index
%53 = scf.if %52 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c15, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %51 [15] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %51 : vector<32x1xi32>
}
%54 = arith.cmpi sgt, %21, %c16 : index
%55 = scf.if %54 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c16, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %53 [16] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %53 : vector<32x1xi32>
}
%56 = arith.cmpi sgt, %21, %c17 : index
%57 = scf.if %56 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c17, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %55 [17] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %55 : vector<32x1xi32>
}
%58 = arith.cmpi sgt, %21, %c18 : index
%59 = scf.if %58 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c18, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %57 [18] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %57 : vector<32x1xi32>
}
%60 = arith.cmpi sgt, %21, %c19 : index
%61 = scf.if %60 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c19, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %59 [19] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %59 : vector<32x1xi32>
}
%62 = arith.cmpi sgt, %21, %c20 : index
%63 = scf.if %62 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c20, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %61 [20] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %61 : vector<32x1xi32>
}
%64 = arith.cmpi sgt, %21, %c21 : index
%65 = scf.if %64 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c21, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %63 [21] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %63 : vector<32x1xi32>
}
%66 = arith.cmpi sgt, %21, %c22 : index
%67 = scf.if %66 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c22, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %65 [22] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %65 : vector<32x1xi32>
}
%68 = arith.cmpi sgt, %21, %c23 : index
%69 = scf.if %68 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c23, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %67 [23] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %67 : vector<32x1xi32>
}
%70 = arith.cmpi sgt, %21, %c24 : index
%71 = scf.if %70 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c24, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %69 [24] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %69 : vector<32x1xi32>
}
%72 = arith.cmpi sgt, %21, %c25 : index
%73 = scf.if %72 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c25, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %71 [25] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %71 : vector<32x1xi32>
}
%74 = arith.cmpi sgt, %21, %c26 : index
%75 = scf.if %74 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c26, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %73 [26] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %73 : vector<32x1xi32>
}
%76 = arith.cmpi sgt, %21, %c27 : index
%77 = scf.if %76 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c27, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %75 [27] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %75 : vector<32x1xi32>
}
%78 = arith.cmpi sgt, %21, %c28 : index
%79 = scf.if %78 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c28, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %77 [28] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %77 : vector<32x1xi32>
}
%80 = arith.cmpi sgt, %21, %c29 : index
%81 = scf.if %80 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c29, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %79 [29] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %79 : vector<32x1xi32>
}
%82 = arith.cmpi sgt, %21, %c30 : index
%83 = scf.if %82 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c30, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %81 [30] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %81 : vector<32x1xi32>
}
%84 = arith.cmpi sgt, %21, %c31 : index
%85 = scf.if %84 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c31, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %83 [31] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %83 : vector<32x1xi32>
}
%86 = arith.cmpi slt, %16, %c32 : index
scf.if %86 {
%155 = vector.extract %85[0, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %16, %c0] : memref<1x1x32x1xi32>
}
%87 = affine.apply affine_map<(d0) -> (d0 + 1)>(%16)
%88 = arith.cmpi slt, %87, %c32 : index
scf.if %88 {
%155 = vector.extract %85[1, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %87, %c0] : memref<1x1x32x1xi32>
}
%89 = affine.apply affine_map<(d0) -> (d0 + 2)>(%16)
%90 = arith.cmpi slt, %89, %c32 : index
scf.if %90 {
%155 = vector.extract %85[2, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %89, %c0] : memref<1x1x32x1xi32>
}
%91 = affine.apply affine_map<(d0) -> (d0 + 3)>(%16)
%92 = arith.cmpi slt, %91, %c32 : index
scf.if %92 {
%155 = vector.extract %85[3, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %91, %c0] : memref<1x1x32x1xi32>
}
%93 = affine.apply affine_map<(d0) -> (d0 + 4)>(%16)
%94 = arith.cmpi slt, %93, %c32 : index
scf.if %94 {
%155 = vector.extract %85[4, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %93, %c0] : memref<1x1x32x1xi32>
}
%95 = affine.apply affine_map<(d0) -> (d0 + 5)>(%16)
%96 = arith.cmpi slt, %95, %c32 : index
scf.if %96 {
%155 = vector.extract %85[5, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %95, %c0] : memref<1x1x32x1xi32>
}
%97 = affine.apply affine_map<(d0) -> (d0 + 6)>(%16)
%98 = arith.cmpi slt, %97, %c32 : index
scf.if %98 {
%155 = vector.extract %85[6, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %97, %c0] : memref<1x1x32x1xi32>
}
%99 = affine.apply affine_map<(d0) -> (d0 + 7)>(%16)
%100 = arith.cmpi slt, %99, %c32 : index
scf.if %100 {
%155 = vector.extract %85[7, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %99, %c0] : memref<1x1x32x1xi32>
}
%101 = affine.apply affine_map<(d0) -> (d0 + 8)>(%16)
%102 = arith.cmpi slt, %101, %c32 : index
scf.if %102 {
%155 = vector.extract %85[8, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %101, %c0] : memref<1x1x32x1xi32>
}
%103 = affine.apply affine_map<(d0) -> (d0 + 9)>(%16)
%104 = arith.cmpi slt, %103, %c32 : index
scf.if %104 {
%155 = vector.extract %85[9, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %103, %c0] : memref<1x1x32x1xi32>
}
%105 = affine.apply affine_map<(d0) -> (d0 + 10)>(%16)
%106 = arith.cmpi slt, %105, %c32 : index
scf.if %106 {
%155 = vector.extract %85[10, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %105, %c0] : memref<1x1x32x1xi32>
}
%107 = affine.apply affine_map<(d0) -> (d0 + 11)>(%16)
%108 = arith.cmpi slt, %107, %c32 : index
scf.if %108 {
%155 = vector.extract %85[11, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %107, %c0] : memref<1x1x32x1xi32>
}
%109 = affine.apply affine_map<(d0) -> (d0 + 12)>(%16)
%110 = arith.cmpi slt, %109, %c32 : index
scf.if %110 {
%155 = vector.extract %85[12, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %109, %c0] : memref<1x1x32x1xi32>
}
%111 = affine.apply affine_map<(d0) -> (d0 + 13)>(%16)
%112 = arith.cmpi slt, %111, %c32 : index
scf.if %112 {
%155 = vector.extract %85[13, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %111, %c0] : memref<1x1x32x1xi32>
}
%113 = affine.apply affine_map<(d0) -> (d0 + 14)>(%16)
%114 = arith.cmpi slt, %113, %c32 : index
scf.if %114 {
%155 = vector.extract %85[14, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %113, %c0] : memref<1x1x32x1xi32>
}
%115 = affine.apply affine_map<(d0) -> (d0 + 15)>(%16)
%116 = arith.cmpi slt, %115, %c32 : index
scf.if %116 {
%155 = vector.extract %85[15, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %115, %c0] : memref<1x1x32x1xi32>
}
%117 = affine.apply affine_map<(d0) -> (d0 + 16)>(%16)
%118 = arith.cmpi slt, %117, %c32 : index
scf.if %118 {
%155 = vector.extract %85[16, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %117, %c0] : memref<1x1x32x1xi32>
}
%119 = affine.apply affine_map<(d0) -> (d0 + 17)>(%16)
%120 = arith.cmpi slt, %119, %c32 : index
scf.if %120 {
%155 = vector.extract %85[17, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %119, %c0] : memref<1x1x32x1xi32>
}
%121 = affine.apply affine_map<(d0) -> (d0 + 18)>(%16)
%122 = arith.cmpi slt, %121, %c32 : index
scf.if %122 {
%155 = vector.extract %85[18, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %121, %c0] : memref<1x1x32x1xi32>
}
%123 = affine.apply affine_map<(d0) -> (d0 + 19)>(%16)
%124 = arith.cmpi slt, %123, %c32 : index
scf.if %124 {
%155 = vector.extract %85[19, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %123, %c0] : memref<1x1x32x1xi32>
}
%125 = affine.apply affine_map<(d0) -> (d0 + 20)>(%16)
%126 = arith.cmpi slt, %125, %c32 : index
scf.if %126 {
%155 = vector.extract %85[20, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %125, %c0] : memref<1x1x32x1xi32>
}
%127 = affine.apply affine_map<(d0) -> (d0 + 21)>(%16)
%128 = arith.cmpi slt, %127, %c32 : index
scf.if %128 {
%155 = vector.extract %85[21, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %127, %c0] : memref<1x1x32x1xi32>
}
%129 = affine.apply affine_map<(d0) -> (d0 + 22)>(%16)
%130 = arith.cmpi slt, %129, %c32 : index
scf.if %130 {
%155 = vector.extract %85[22, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %129, %c0] : memref<1x1x32x1xi32>
}
%131 = affine.apply affine_map<(d0) -> (d0 + 23)>(%16)
%132 = arith.cmpi slt, %131, %c32 : index
scf.if %132 {
%155 = vector.extract %85[23, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %131, %c0] : memref<1x1x32x1xi32>
}
%133 = affine.apply affine_map<(d0) -> (d0 + 24)>(%16)
%134 = arith.cmpi slt, %133, %c32 : index
scf.if %134 {
%155 = vector.extract %85[24, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %133, %c0] : memref<1x1x32x1xi32>
}
%135 = affine.apply affine_map<(d0) -> (d0 + 25)>(%16)
%136 = arith.cmpi slt, %135, %c32 : index
scf.if %136 {
%155 = vector.extract %85[25, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %135, %c0] : memref<1x1x32x1xi32>
}
%137 = affine.apply affine_map<(d0) -> (d0 + 26)>(%16)
%138 = arith.cmpi slt, %137, %c32 : index
scf.if %138 {
%155 = vector.extract %85[26, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %137, %c0] : memref<1x1x32x1xi32>
}
%139 = affine.apply affine_map<(d0) -> (d0 + 27)>(%16)
%140 = arith.cmpi slt, %139, %c32 : index
scf.if %140 {
%155 = vector.extract %85[27, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %139, %c0] : memref<1x1x32x1xi32>
}
%141 = affine.apply affine_map<(d0) -> (d0 + 28)>(%16)
%142 = arith.cmpi slt, %141, %c32 : index
scf.if %142 {
%155 = vector.extract %85[28, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %141, %c0] : memref<1x1x32x1xi32>
}
%143 = affine.apply affine_map<(d0) -> (d0 + 29)>(%16)
%144 = arith.cmpi slt, %143, %c32 : index
scf.if %144 {
%155 = vector.extract %85[29, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %143, %c0] : memref<1x1x32x1xi32>
}
%145 = affine.apply affine_map<(d0) -> (d0 + 30)>(%16)
%146 = arith.cmpi slt, %145, %c32 : index
scf.if %146 {
%155 = vector.extract %85[30, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %145, %c0] : memref<1x1x32x1xi32>
}
%147 = affine.apply affine_map<(d0) -> (d0 + 31)>(%16)
%148 = arith.cmpi slt, %147, %c32 : index
scf.if %148 {
%155 = vector.extract %85[31, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %147, %c0] : memref<1x1x32x1xi32>
}
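// Tap epilogue: out[0..31] += w[%arg2] * window[0..31]. The filter weight is
// read from the constant tensor %cst, splatted across 32 lanes via
// vector.shuffle, multiplied with the staged window, and accumulated into
// the output row through %collapse_shape_6.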
%149 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%extracted = tensor.extract %cst[%c0, %arg2, %c0] : tensor<1x43x1xi32>
%150 = vector.broadcast %extracted : i32 to vector<1xi32>
%151 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%152 = vector.shuffle %150, %150 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%153 = arith.muli %149, %152 : vector<32xi32>
%154 = arith.addi %153, %151 : vector<32xi32>
vector.store %154, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
}
}
}
return
}
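// Relative to the previous dump, canonicalization has dropped the empty
// `else { }` regions from the store-guard scf.ifs and expressed the
// tile/padding bounds as single affine maps over the workgroup-id symbols
// (e.g. affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]),
// leaving the structure of the kernel unchanged.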
// -----// IR Dump After CSE (cse) //----- //
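// CSE removes structurally identical redundant ops; in the prefix shown here
// the function matches the canonicalized form, as canonicalization had
// already left each constant and affine computation with a single definition.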
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c5 = arith.constant 5 : index
%c6 = arith.constant 6 : index
%c7 = arith.constant 7 : index
%c8 = arith.constant 8 : index
%c9 = arith.constant 9 : index
%c10 = arith.constant 10 : index
%c11 = arith.constant 11 : index
%c12 = arith.constant 12 : index
%c13 = arith.constant 13 : index
%c14 = arith.constant 14 : index
%c15 = arith.constant 15 : index
%c16 = arith.constant 16 : index
%c17 = arith.constant 17 : index
%c18 = arith.constant 18 : index
%c19 = arith.constant 19 : index
%c20 = arith.constant 20 : index
%c21 = arith.constant 21 : index
%c22 = arith.constant 22 : index
%c23 = arith.constant 23 : index
%c24 = arith.constant 24 : index
%c25 = arith.constant 25 : index
%c26 = arith.constant 26 : index
%c27 = arith.constant 27 : index
%c28 = arith.constant 28 : index
%c29 = arith.constant 29 : index
%c30 = arith.constant 30 : index
%c31 = arith.constant 31 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %0, 64 : memref<1x1080x1920x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %1[0, %2, %3, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%4 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%5 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (1920, s0)>()[%5]
%7 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%8 = affine.min affine_map<()[s0] -> (1920, s0)>()[%7]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%8, %6]
%subview_1 = memref.subview %0[0, %2, %6, 0] [1, 60, %9, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%10 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%4]
%11 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%4]
%12 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%11)[%8, %6]
%13 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%4]
%14 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%13)[%8, %6]
%15 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%14, %12)
%subview_3 = memref.subview %subview_1[0, %arg0, %12, 0] [1, 1, %15, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
scf.for %arg2 = %c0 to %c43 step %c1 {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%10, %arg2)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %10)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %14, %12)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %10)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %14, %12)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%subview_7 = memref.subview %subview_3[0, 0, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %alloca[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32>
%22 = arith.cmpi sgt, %21, %c0 : index
%23 = scf.if %22 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c0, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %cst_0 [0] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %cst_0 : vector<32x1xi32>
}
%24 = arith.cmpi sgt, %21, %c1 : index
%25 = scf.if %24 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c1, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %23 [1] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %23 : vector<32x1xi32>
}
%26 = arith.cmpi sgt, %21, %c2 : index
%27 = scf.if %26 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c2, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %25 [2] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %25 : vector<32x1xi32>
}
%28 = arith.cmpi sgt, %21, %c3 : index
%29 = scf.if %28 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c3, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %27 [3] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %27 : vector<32x1xi32>
}
%30 = arith.cmpi sgt, %21, %c4 : index
%31 = scf.if %30 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c4, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %29 [4] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %29 : vector<32x1xi32>
}
%32 = arith.cmpi sgt, %21, %c5 : index
%33 = scf.if %32 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c5, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %31 [5] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %31 : vector<32x1xi32>
}
%34 = arith.cmpi sgt, %21, %c6 : index
%35 = scf.if %34 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c6, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %33 [6] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %33 : vector<32x1xi32>
}
%36 = arith.cmpi sgt, %21, %c7 : index
%37 = scf.if %36 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c7, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %35 [7] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %35 : vector<32x1xi32>
}
%38 = arith.cmpi sgt, %21, %c8 : index
%39 = scf.if %38 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c8, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %37 [8] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %37 : vector<32x1xi32>
}
%40 = arith.cmpi sgt, %21, %c9 : index
%41 = scf.if %40 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c9, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %39 [9] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %39 : vector<32x1xi32>
}
%42 = arith.cmpi sgt, %21, %c10 : index
%43 = scf.if %42 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c10, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %41 [10] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %41 : vector<32x1xi32>
}
%44 = arith.cmpi sgt, %21, %c11 : index
%45 = scf.if %44 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c11, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %43 [11] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %43 : vector<32x1xi32>
}
%46 = arith.cmpi sgt, %21, %c12 : index
%47 = scf.if %46 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c12, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %45 [12] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %45 : vector<32x1xi32>
}
%48 = arith.cmpi sgt, %21, %c13 : index
%49 = scf.if %48 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c13, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %47 [13] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %47 : vector<32x1xi32>
}
%50 = arith.cmpi sgt, %21, %c14 : index
%51 = scf.if %50 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c14, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %49 [14] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %49 : vector<32x1xi32>
}
%52 = arith.cmpi sgt, %21, %c15 : index
%53 = scf.if %52 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c15, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %51 [15] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %51 : vector<32x1xi32>
}
%54 = arith.cmpi sgt, %21, %c16 : index
%55 = scf.if %54 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c16, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %53 [16] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %53 : vector<32x1xi32>
}
%56 = arith.cmpi sgt, %21, %c17 : index
%57 = scf.if %56 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c17, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %55 [17] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %55 : vector<32x1xi32>
}
%58 = arith.cmpi sgt, %21, %c18 : index
%59 = scf.if %58 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c18, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %57 [18] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %57 : vector<32x1xi32>
}
%60 = arith.cmpi sgt, %21, %c19 : index
%61 = scf.if %60 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c19, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %59 [19] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %59 : vector<32x1xi32>
}
%62 = arith.cmpi sgt, %21, %c20 : index
%63 = scf.if %62 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c20, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %61 [20] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %61 : vector<32x1xi32>
}
%64 = arith.cmpi sgt, %21, %c21 : index
%65 = scf.if %64 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c21, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %63 [21] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %63 : vector<32x1xi32>
}
%66 = arith.cmpi sgt, %21, %c22 : index
%67 = scf.if %66 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c22, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %65 [22] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %65 : vector<32x1xi32>
}
%68 = arith.cmpi sgt, %21, %c23 : index
%69 = scf.if %68 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c23, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %67 [23] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %67 : vector<32x1xi32>
}
%70 = arith.cmpi sgt, %21, %c24 : index
%71 = scf.if %70 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c24, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %69 [24] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %69 : vector<32x1xi32>
}
%72 = arith.cmpi sgt, %21, %c25 : index
%73 = scf.if %72 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c25, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %71 [25] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %71 : vector<32x1xi32>
}
%74 = arith.cmpi sgt, %21, %c26 : index
%75 = scf.if %74 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c26, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %73 [26] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %73 : vector<32x1xi32>
}
%76 = arith.cmpi sgt, %21, %c27 : index
%77 = scf.if %76 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c27, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %75 [27] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %75 : vector<32x1xi32>
}
%78 = arith.cmpi sgt, %21, %c28 : index
%79 = scf.if %78 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c28, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %77 [28] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %77 : vector<32x1xi32>
}
%80 = arith.cmpi sgt, %21, %c29 : index
%81 = scf.if %80 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c29, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %79 [29] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %79 : vector<32x1xi32>
}
%82 = arith.cmpi sgt, %21, %c30 : index
%83 = scf.if %82 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c30, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %81 [30] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %81 : vector<32x1xi32>
}
%84 = arith.cmpi sgt, %21, %c31 : index
%85 = scf.if %84 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c31, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %83 [31] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %83 : vector<32x1xi32>
}
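// Note: the conditional stores that follow scatter the gathered lanes back
// into the 32-element staging buffer %alloca at a dynamic offset %16 (derived
// from the convolution's border padding), again with a per-lane bounds check;
// combined with the zero-fill above, this materializes a zero-padded input
// window for the current filter tap.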
%86 = arith.cmpi slt, %16, %c32 : index
scf.if %86 {
%155 = vector.extract %85[0, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %16, %c0] : memref<1x1x32x1xi32>
}
%87 = affine.apply affine_map<(d0) -> (d0 + 1)>(%16)
%88 = arith.cmpi slt, %87, %c32 : index
scf.if %88 {
%155 = vector.extract %85[1, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %87, %c0] : memref<1x1x32x1xi32>
}
%89 = affine.apply affine_map<(d0) -> (d0 + 2)>(%16)
%90 = arith.cmpi slt, %89, %c32 : index
scf.if %90 {
%155 = vector.extract %85[2, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %89, %c0] : memref<1x1x32x1xi32>
}
%91 = affine.apply affine_map<(d0) -> (d0 + 3)>(%16)
%92 = arith.cmpi slt, %91, %c32 : index
scf.if %92 {
%155 = vector.extract %85[3, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %91, %c0] : memref<1x1x32x1xi32>
}
%93 = affine.apply affine_map<(d0) -> (d0 + 4)>(%16)
%94 = arith.cmpi slt, %93, %c32 : index
scf.if %94 {
%155 = vector.extract %85[4, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %93, %c0] : memref<1x1x32x1xi32>
}
%95 = affine.apply affine_map<(d0) -> (d0 + 5)>(%16)
%96 = arith.cmpi slt, %95, %c32 : index
scf.if %96 {
%155 = vector.extract %85[5, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %95, %c0] : memref<1x1x32x1xi32>
}
%97 = affine.apply affine_map<(d0) -> (d0 + 6)>(%16)
%98 = arith.cmpi slt, %97, %c32 : index
scf.if %98 {
%155 = vector.extract %85[6, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %97, %c0] : memref<1x1x32x1xi32>
}
%99 = affine.apply affine_map<(d0) -> (d0 + 7)>(%16)
%100 = arith.cmpi slt, %99, %c32 : index
scf.if %100 {
%155 = vector.extract %85[7, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %99, %c0] : memref<1x1x32x1xi32>
}
%101 = affine.apply affine_map<(d0) -> (d0 + 8)>(%16)
%102 = arith.cmpi slt, %101, %c32 : index
scf.if %102 {
%155 = vector.extract %85[8, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %101, %c0] : memref<1x1x32x1xi32>
}
%103 = affine.apply affine_map<(d0) -> (d0 + 9)>(%16)
%104 = arith.cmpi slt, %103, %c32 : index
scf.if %104 {
%155 = vector.extract %85[9, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %103, %c0] : memref<1x1x32x1xi32>
}
%105 = affine.apply affine_map<(d0) -> (d0 + 10)>(%16)
%106 = arith.cmpi slt, %105, %c32 : index
scf.if %106 {
%155 = vector.extract %85[10, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %105, %c0] : memref<1x1x32x1xi32>
}
%107 = affine.apply affine_map<(d0) -> (d0 + 11)>(%16)
%108 = arith.cmpi slt, %107, %c32 : index
scf.if %108 {
%155 = vector.extract %85[11, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %107, %c0] : memref<1x1x32x1xi32>
}
%109 = affine.apply affine_map<(d0) -> (d0 + 12)>(%16)
%110 = arith.cmpi slt, %109, %c32 : index
scf.if %110 {
%155 = vector.extract %85[12, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %109, %c0] : memref<1x1x32x1xi32>
}
%111 = affine.apply affine_map<(d0) -> (d0 + 13)>(%16)
%112 = arith.cmpi slt, %111, %c32 : index
scf.if %112 {
%155 = vector.extract %85[13, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %111, %c0] : memref<1x1x32x1xi32>
}
%113 = affine.apply affine_map<(d0) -> (d0 + 14)>(%16)
%114 = arith.cmpi slt, %113, %c32 : index
scf.if %114 {
%155 = vector.extract %85[14, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %113, %c0] : memref<1x1x32x1xi32>
}
%115 = affine.apply affine_map<(d0) -> (d0 + 15)>(%16)
%116 = arith.cmpi slt, %115, %c32 : index
scf.if %116 {
%155 = vector.extract %85[15, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %115, %c0] : memref<1x1x32x1xi32>
}
%117 = affine.apply affine_map<(d0) -> (d0 + 16)>(%16)
%118 = arith.cmpi slt, %117, %c32 : index
scf.if %118 {
%155 = vector.extract %85[16, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %117, %c0] : memref<1x1x32x1xi32>
}
%119 = affine.apply affine_map<(d0) -> (d0 + 17)>(%16)
%120 = arith.cmpi slt, %119, %c32 : index
scf.if %120 {
%155 = vector.extract %85[17, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %119, %c0] : memref<1x1x32x1xi32>
}
%121 = affine.apply affine_map<(d0) -> (d0 + 18)>(%16)
%122 = arith.cmpi slt, %121, %c32 : index
scf.if %122 {
%155 = vector.extract %85[18, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %121, %c0] : memref<1x1x32x1xi32>
}
%123 = affine.apply affine_map<(d0) -> (d0 + 19)>(%16)
%124 = arith.cmpi slt, %123, %c32 : index
scf.if %124 {
%155 = vector.extract %85[19, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %123, %c0] : memref<1x1x32x1xi32>
}
%125 = affine.apply affine_map<(d0) -> (d0 + 20)>(%16)
%126 = arith.cmpi slt, %125, %c32 : index
scf.if %126 {
%155 = vector.extract %85[20, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %125, %c0] : memref<1x1x32x1xi32>
}
%127 = affine.apply affine_map<(d0) -> (d0 + 21)>(%16)
%128 = arith.cmpi slt, %127, %c32 : index
scf.if %128 {
%155 = vector.extract %85[21, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %127, %c0] : memref<1x1x32x1xi32>
}
%129 = affine.apply affine_map<(d0) -> (d0 + 22)>(%16)
%130 = arith.cmpi slt, %129, %c32 : index
scf.if %130 {
%155 = vector.extract %85[22, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %129, %c0] : memref<1x1x32x1xi32>
}
%131 = affine.apply affine_map<(d0) -> (d0 + 23)>(%16)
%132 = arith.cmpi slt, %131, %c32 : index
scf.if %132 {
%155 = vector.extract %85[23, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %131, %c0] : memref<1x1x32x1xi32>
}
%133 = affine.apply affine_map<(d0) -> (d0 + 24)>(%16)
%134 = arith.cmpi slt, %133, %c32 : index
scf.if %134 {
%155 = vector.extract %85[24, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %133, %c0] : memref<1x1x32x1xi32>
}
%135 = affine.apply affine_map<(d0) -> (d0 + 25)>(%16)
%136 = arith.cmpi slt, %135, %c32 : index
scf.if %136 {
%155 = vector.extract %85[25, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %135, %c0] : memref<1x1x32x1xi32>
}
%137 = affine.apply affine_map<(d0) -> (d0 + 26)>(%16)
%138 = arith.cmpi slt, %137, %c32 : index
scf.if %138 {
%155 = vector.extract %85[26, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %137, %c0] : memref<1x1x32x1xi32>
}
%139 = affine.apply affine_map<(d0) -> (d0 + 27)>(%16)
%140 = arith.cmpi slt, %139, %c32 : index
scf.if %140 {
%155 = vector.extract %85[27, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %139, %c0] : memref<1x1x32x1xi32>
}
%141 = affine.apply affine_map<(d0) -> (d0 + 28)>(%16)
%142 = arith.cmpi slt, %141, %c32 : index
scf.if %142 {
%155 = vector.extract %85[28, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %141, %c0] : memref<1x1x32x1xi32>
}
%143 = affine.apply affine_map<(d0) -> (d0 + 29)>(%16)
%144 = arith.cmpi slt, %143, %c32 : index
scf.if %144 {
%155 = vector.extract %85[29, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %143, %c0] : memref<1x1x32x1xi32>
}
%145 = affine.apply affine_map<(d0) -> (d0 + 30)>(%16)
%146 = arith.cmpi slt, %145, %c32 : index
scf.if %146 {
%155 = vector.extract %85[30, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %145, %c0] : memref<1x1x32x1xi32>
}
%147 = affine.apply affine_map<(d0) -> (d0 + 31)>(%16)
%148 = arith.cmpi slt, %147, %c32 : index
scf.if %148 {
%155 = vector.extract %85[31, 0] : vector<32x1xi32>
memref.store %155, %alloca[%c0, %c0, %147, %c0] : memref<1x1x32x1xi32>
}
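// Note: the next few ops compute one tap of the 43-tap depthwise convolution:
// load the zero-padded 32-wide input window, splat the filter coefficient
// %cst[0, %arg2, 0] across 32 lanes (vector.broadcast + vector.shuffle),
// multiply elementwise, and accumulate into the output strip %collapse_shape_6.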
%149 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
%extracted = tensor.extract %cst[%c0, %arg2, %c0] : tensor<1x43x1xi32>
%150 = vector.broadcast %extracted : i32 to vector<1xi32>
%151 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
%152 = vector.shuffle %150, %150 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%153 = arith.muli %149, %152 : vector<32xi32>
%154 = arith.addi %153, %151 : vector<32xi32>
vector.store %154, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
}
}
}
return
}
// -----// IR Dump After LinalgStrategyLowerVectorsPass (iree-linalg-strategy-lower-vectors-pass) //----- //
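// Note: this second dump shows the same dispatch function after the vector
// lowering pass; the overall structure (tiling loops, zero-padded staging
// buffer, scalarized masked reads/writes) is essentially unchanged from the
// dump above.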
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c0_i32 = arith.constant 0 : i32
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c5 = arith.constant 5 : index
%c6 = arith.constant 6 : index
%c7 = arith.constant 7 : index
%c8 = arith.constant 8 : index
%c9 = arith.constant 9 : index
%c10 = arith.constant 10 : index
%c11 = arith.constant 11 : index
%c12 = arith.constant 12 : index
%c13 = arith.constant 13 : index
%c14 = arith.constant 14 : index
%c15 = arith.constant 15 : index
%c16 = arith.constant 16 : index
%c17 = arith.constant 17 : index
%c18 = arith.constant 18 : index
%c19 = arith.constant 19 : index
%c20 = arith.constant 20 : index
%c21 = arith.constant 21 : index
%c22 = arith.constant 22 : index
%c23 = arith.constant 23 : index
%c24 = arith.constant 24 : index
%c25 = arith.constant 25 : index
%c26 = arith.constant 26 : index
%c27 = arith.constant 27 : index
%c28 = arith.constant 28 : index
%c29 = arith.constant 29 : index
%c30 = arith.constant 30 : index
%c31 = arith.constant 31 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c60 = arith.constant 60 : index
%c32 = arith.constant 32 : index
%c43 = arith.constant 43 : index
%cst_0 = arith.constant dense<0> : vector<32x1xi32>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
memref.assume_alignment %0, 64 : memref<1x1080x1920x1xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%subview = memref.subview %1[0, %2, %3, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%4 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
%5 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
%6 = affine.min affine_map<()[s0] -> (1920, s0)>()[%5]
%7 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
%8 = affine.min affine_map<()[s0] -> (1920, s0)>()[%7]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%8, %6]
%subview_1 = memref.subview %0[0, %2, %6, 0] [1, 60, %9, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
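// Note: the loop nest below walks this workgroup's 60x64 tile: %arg0 steps
// over the 60 rows and %arg1 over the 64 columns in strips of 32, matching
// the vector<32xi32> width used for the accumulation.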
scf.for %arg0 = %c0 to %c60 step %c1 {
scf.for %arg1 = %c0 to %c64 step %c32 {
%10 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%4]
%11 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%4]
%12 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%11)[%8, %6]
%13 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%4]
%14 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%13)[%8, %6]
%15 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%14, %12)
%subview_3 = memref.subview %subview_1[0, %arg0, %12, 0] [1, 1, %15, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
memref.store %c0_i32, %subview_4[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
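// Note: the 1x1x32x1 output strip has been zeroed and collapsed to a 1x32
// view (%collapse_shape_6); the loop below reduces over the 43 filter taps,
// accumulating one tap per iteration.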
scf.for %arg2 = %c0 to %c43 step %c1 {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%10, %arg2)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %10)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %14, %12)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %10)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %14, %12)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%subview_7 = memref.subview %subview_3[0, 0, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
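// Note: zero-fill the staging buffer %alloca before the masked read; lanes
// that fall outside the valid input slice (dynamic width %21) keep this zero,
// realizing the zero-padding of the convolution border in-place.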
memref.store %c0_i32, %alloca[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32>
memref.store %c0_i32, %alloca[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32>
%22 = arith.cmpi sgt, %21, %c0 : index
%23 = scf.if %22 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c0, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %cst_0 [0] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %cst_0 : vector<32x1xi32>
}
%24 = arith.cmpi sgt, %21, %c1 : index
%25 = scf.if %24 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c1, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %23 [1] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %23 : vector<32x1xi32>
}
%26 = arith.cmpi sgt, %21, %c2 : index
%27 = scf.if %26 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c2, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %25 [2] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %25 : vector<32x1xi32>
}
%28 = arith.cmpi sgt, %21, %c3 : index
%29 = scf.if %28 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c3, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %27 [3] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %27 : vector<32x1xi32>
}
%30 = arith.cmpi sgt, %21, %c4 : index
%31 = scf.if %30 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c4, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %29 [4] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %29 : vector<32x1xi32>
}
%32 = arith.cmpi sgt, %21, %c5 : index
%33 = scf.if %32 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c5, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %31 [5] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %31 : vector<32x1xi32>
}
%34 = arith.cmpi sgt, %21, %c6 : index
%35 = scf.if %34 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c6, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %33 [6] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %33 : vector<32x1xi32>
}
%36 = arith.cmpi sgt, %21, %c7 : index
%37 = scf.if %36 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c7, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %35 [7] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %35 : vector<32x1xi32>
}
%38 = arith.cmpi sgt, %21, %c8 : index
%39 = scf.if %38 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c8, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %37 [8] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %37 : vector<32x1xi32>
}
%40 = arith.cmpi sgt, %21, %c9 : index
%41 = scf.if %40 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c9, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %39 [9] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %39 : vector<32x1xi32>
}
%42 = arith.cmpi sgt, %21, %c10 : index
%43 = scf.if %42 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c10, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %41 [10] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %41 : vector<32x1xi32>
}
%44 = arith.cmpi sgt, %21, %c11 : index
%45 = scf.if %44 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c11, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %43 [11] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %43 : vector<32x1xi32>
}
%46 = arith.cmpi sgt, %21, %c12 : index
%47 = scf.if %46 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c12, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %45 [12] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %45 : vector<32x1xi32>
}
%48 = arith.cmpi sgt, %21, %c13 : index
%49 = scf.if %48 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c13, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %47 [13] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %47 : vector<32x1xi32>
}
%50 = arith.cmpi sgt, %21, %c14 : index
%51 = scf.if %50 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c14, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %49 [14] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %49 : vector<32x1xi32>
}
%52 = arith.cmpi sgt, %21, %c15 : index
%53 = scf.if %52 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c15, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %51 [15] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %51 : vector<32x1xi32>
}
%54 = arith.cmpi sgt, %21, %c16 : index
%55 = scf.if %54 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c16, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %53 [16] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %53 : vector<32x1xi32>
}
%56 = arith.cmpi sgt, %21, %c17 : index
%57 = scf.if %56 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c17, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %55 [17] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %55 : vector<32x1xi32>
}
%58 = arith.cmpi sgt, %21, %c18 : index
%59 = scf.if %58 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c18, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %57 [18] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %57 : vector<32x1xi32>
}
%60 = arith.cmpi sgt, %21, %c19 : index
%61 = scf.if %60 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c19, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %59 [19] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %59 : vector<32x1xi32>
}
%62 = arith.cmpi sgt, %21, %c20 : index
%63 = scf.if %62 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c20, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %61 [20] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %61 : vector<32x1xi32>
}
%64 = arith.cmpi sgt, %21, %c21 : index
%65 = scf.if %64 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c21, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %63 [21] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %63 : vector<32x1xi32>
}
%66 = arith.cmpi sgt, %21, %c22 : index
%67 = scf.if %66 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c22, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %65 [22] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %65 : vector<32x1xi32>
}
%68 = arith.cmpi sgt, %21, %c23 : index
%69 = scf.if %68 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c23, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %67 [23] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %67 : vector<32x1xi32>
}
%70 = arith.cmpi sgt, %21, %c24 : index
%71 = scf.if %70 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c24, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %69 [24] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %69 : vector<32x1xi32>
}
%72 = arith.cmpi sgt, %21, %c25 : index
%73 = scf.if %72 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c25, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %71 [25] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %71 : vector<32x1xi32>
}
%74 = arith.cmpi sgt, %21, %c26 : index
%75 = scf.if %74 -> (vector<32x1xi32>) {
%155 = memref.load %subview_7[%c0, %c0, %c26, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
%156 = vector.broadcast %155 : i32 to vector<1xi32>
%157 = vector.insert %156, %73 [26] : vector<1xi32> into vector<32x1xi32>
scf.yield %157 : vector<32x1xi32>
} else {
scf.yield %73 : vector<32x1xi32>
}