Created March 14, 2023 16:43
Generated using the pad_fusion branch of IREE (per-pass MLIR IR dumps for a 1x1080x1920x1 depthwise conv with a 1x43x1 kernel, targeting llvm-cpu / aarch64).
Note: this file has been truncated; the final IR dump below is cut off mid-function.
// -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) //----- //
hal.executable.variant public @embedded_elf_arm_64, target = <"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "+reserve-x18", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-unknown-unknown-eabi-elf"}> {
  hal.executable.export public @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
  ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index):
    %c30 = arith.constant 30 : index
    %c18 = arith.constant 18 : index
    %c1 = arith.constant 1 : index
    hal.return %c30, %c18, %c1 : index, index, index
  }
  builtin.module {
    func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
      %c60 = arith.constant 60 : index
      %c64 = arith.constant 64 : index
      %c1920 = arith.constant 1920 : index
      %c1080 = arith.constant 1080 : index
      %c0 = arith.constant 0 : index
      %cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
      %c0_i32 = arith.constant 0 : i32
      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
      %workgroup_id_x = hal.interface.workgroup.id[0] : index
      %workgroup_count_x = hal.interface.workgroup.count[0] : index
      %workgroup_id_y = hal.interface.workgroup.id[1] : index
      %workgroup_count_y = hal.interface.workgroup.count[1] : index
      %2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
      %3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
      scf.for %arg0 = %2 to %c1080 step %3 {
        %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
        %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
        scf.for %arg1 = %4 to %c1920 step %5 {
          %6 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
          %7 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
          %8 = affine.min affine_map<(d0) -> (1920, d0)>(%7)
          %9 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
          %10 = affine.min affine_map<(d0) -> (1920, d0)>(%9)
          %11 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%10, %8)
          %12 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 106)>(%6, %10, %8)
          %13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %8, 0], sizes = [1, %c60, %11, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x?x?x1xi32>
          %padded = tensor.pad %13 low[0, 0, %6, 0] high[0, 0, %12, 0] {
          ^bb0(%arg2: index, %arg3: index, %arg4: index, %arg5: index):
            tensor.yield %c0_i32 : i32
          } : tensor<1x?x?x1xi32> to tensor<1x?x?x1xi32>
          %14 = tensor.empty() : tensor<1x60x64x1xi32>
          %cast = tensor.cast %14 : tensor<1x60x64x1xi32> to tensor<1x?x?x1xi32>
          %15 = linalg.fill ins(%c0_i32 : i32) outs(%cast : tensor<1x?x?x1xi32>) -> tensor<1x?x?x1xi32>
          %16 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x?x?x1xi32>, tensor<1x43x1xi32>) outs(%15 : tensor<1x?x?x1xi32>) -> tensor<1x?x?x1xi32>
          flow.dispatch.tensor.store %16, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, %c60, %c64, 1], strides = [1, 1, 1, 1] : tensor<1x?x?x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
        }
      }
      return
    }
  }
}
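
The exported entry point above returns a workgroup grid of (30, 18, 1). As a quick sanity check, here is a minimal sketch in plain Python (no IREE dependency; it only uses the tile sizes read off the first level of the lowering_config): the grid is the ceiling division of each tiled output dimension by its tile size.

import math

# Output is 1x1080x1920x1; workgroup tiles are 60 along H (y) and 64 along W (x),
# per the first level of tile_sizes in the lowering_config above.
H, W = 1080, 1920
TILE_H, TILE_W = 60, 64
grid_x = math.ceil(W / TILE_W)  # 1920 / 64 = 30
grid_y = math.ceil(H / TILE_H)  # 1080 / 60 = 18
assert (grid_x, grid_y) == (30, 18)  # matches `hal.return %c30, %c18, %c1`
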
// -----// IR Dump After ConvertToDestinationPassingStyle (iree-codegen-convert-to-destination-passing-style) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
  %c60 = arith.constant 60 : index
  %c64 = arith.constant 64 : index
  %c1920 = arith.constant 1920 : index
  %c1080 = arith.constant 1080 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
  %c0_i32 = arith.constant 0 : i32
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
  %3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
  scf.for %arg0 = %2 to %c1080 step %3 {
    %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
    %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
    scf.for %arg1 = %4 to %c1920 step %5 {
      %6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, %c60, %c64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x?x?x1xi32>
      %cast = tensor.cast %6 : tensor<1x?x?x1xi32> to tensor<1x60x64x1xi32>
      %7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
      %8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
      %9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
      %10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
      %11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
      %12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
      %13 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 106)>(%7, %11, %9)
      %14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, %c60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x?x?x1xi32>
      %padded = tensor.pad %14 low[0, 0, %7, 0] high[0, 0, %13, 0] {
      ^bb0(%arg2: index, %arg3: index, %arg4: index, %arg5: index):
        tensor.yield %c0_i32 : i32
      } : tensor<1x?x?x1xi32> to tensor<1x?x?x1xi32>
      %cast_0 = tensor.cast %cast : tensor<1x60x64x1xi32> to tensor<1x?x?x1xi32>
      %15 = linalg.fill ins(%c0_i32 : i32) outs(%cast_0 : tensor<1x?x?x1xi32>) -> tensor<1x?x?x1xi32>
      %16 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x?x?x1xi32>, tensor<1x43x1xi32>) outs(%15 : tensor<1x?x?x1xi32>) -> tensor<1x?x?x1xi32>
      flow.dispatch.tensor.store %16, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, %c60, %c64, 1], strides = [1, 1, 1, 1] : tensor<1x?x?x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
    }
  }
  return
}
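
Note how the pass above rewrites the loop body: instead of `tensor.empty()` plus `tensor.cast`, the destination is now loaded from the write-only output binding (`%6`) and threaded through `linalg.fill` and the convolution as their `outs` operand, so bufferization can later write results in place. A minimal Python/NumPy sketch of the destination-passing idea (hypothetical helper names, not IREE API):

import numpy as np

# Value style: the producer allocates its own result (like tensor.empty()).
def fill_value_style(shape):
    out = np.empty(shape, dtype=np.int32)
    out[...] = 0  # like linalg.fill
    return out

# Destination-passing style: the caller supplies the destination, like
# loading %6 from binding 1 and passing it as the `outs` operand.
def fill_dps(dest):
    dest[...] = 0
    return dest

dest = np.empty((1, 60, 64, 1), dtype=np.int32)
result = fill_dps(dest)
assert result is dest  # no fresh allocation; writes land in the destination
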
// -----// IR Dump After TileAndDecomposeAttention (iree-linalg-ext-tile-and-decompose-attention) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
  %c60 = arith.constant 60 : index
  %c64 = arith.constant 64 : index
  %c1920 = arith.constant 1920 : index
  %c1080 = arith.constant 1080 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
  %c0_i32 = arith.constant 0 : i32
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
  %3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
  scf.for %arg0 = %2 to %c1080 step %3 {
    %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
    %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
    scf.for %arg1 = %4 to %c1920 step %5 {
      %6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, %c60, %c64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x?x?x1xi32>
      %cast = tensor.cast %6 : tensor<1x?x?x1xi32> to tensor<1x60x64x1xi32>
      %7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
      %8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
      %9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
      %10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
      %11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
      %12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
      %13 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 106)>(%7, %11, %9)
      %14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, %c60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x?x?x1xi32>
      %padded = tensor.pad %14 low[0, 0, %7, 0] high[0, 0, %13, 0] {
      ^bb0(%arg2: index, %arg3: index, %arg4: index, %arg5: index):
        tensor.yield %c0_i32 : i32
      } : tensor<1x?x?x1xi32> to tensor<1x?x?x1xi32>
      %cast_0 = tensor.cast %cast : tensor<1x60x64x1xi32> to tensor<1x?x?x1xi32>
      %15 = linalg.fill ins(%c0_i32 : i32) outs(%cast_0 : tensor<1x?x?x1xi32>) -> tensor<1x?x?x1xi32>
      %16 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x?x?x1xi32>, tensor<1x43x1xi32>) outs(%15 : tensor<1x?x?x1xi32>) -> tensor<1x?x?x1xi32>
      flow.dispatch.tensor.store %16, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, %c60, %c64, 1], strides = [1, 1, 1, 1] : tensor<1x?x?x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
    }
  }
  return
}
// -----// IR Dump After FoldAffineMinInDistributedLoops (iree-codegen-fold-affinemin-in-distributed-loops) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
  %c60 = arith.constant 60 : index
  %c64 = arith.constant 64 : index
  %c1920 = arith.constant 1920 : index
  %c1080 = arith.constant 1080 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
  %c0_i32 = arith.constant 0 : i32
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
  %3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
  scf.for %arg0 = %2 to %c1080 step %3 {
    %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
    %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
    scf.for %arg1 = %4 to %c1920 step %5 {
      %6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, %c60, %c64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x?x?x1xi32>
      %cast = tensor.cast %6 : tensor<1x?x?x1xi32> to tensor<1x60x64x1xi32>
      %7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
      %8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
      %9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
      %10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
      %11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
      %12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
      %13 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 106)>(%7, %11, %9)
      %14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, %c60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x?x?x1xi32>
      %padded = tensor.pad %14 low[0, 0, %7, 0] high[0, 0, %13, 0] {
      ^bb0(%arg2: index, %arg3: index, %arg4: index, %arg5: index):
        tensor.yield %c0_i32 : i32
      } : tensor<1x?x?x1xi32> to tensor<1x?x?x1xi32>
      %cast_0 = tensor.cast %cast : tensor<1x60x64x1xi32> to tensor<1x?x?x1xi32>
      %15 = linalg.fill ins(%c0_i32 : i32) outs(%cast_0 : tensor<1x?x?x1xi32>) -> tensor<1x?x?x1xi32>
      %16 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x?x?x1xi32>, tensor<1x43x1xi32>) outs(%15 : tensor<1x?x?x1xi32>) -> tensor<1x?x?x1xi32>
      flow.dispatch.tensor.store %16, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, %c60, %c64, 1], strides = [1, 1, 1, 1] : tensor<1x?x?x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
    }
  }
  return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
  func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
    %c1920 = arith.constant 1920 : index
    %c1080 = arith.constant 1080 : index
    %c0 = arith.constant 0 : index
    %cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
    %c0_i32 = arith.constant 0 : i32
    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_count_x = hal.interface.workgroup.count[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_count_y = hal.interface.workgroup.count[1] : index
    %2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
    %3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
    scf.for %arg0 = %2 to %c1080 step %3 {
      %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
      %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
      scf.for %arg1 = %4 to %c1920 step %5 {
        %6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
        %7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
        %8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
        %9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
        %10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
        %11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
        %12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
        %13 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 106)>(%7, %11, %9)
        %14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
        %padded = tensor.pad %14 low[0, 0, %7, 0] high[0, 0, %13, 0] {
        ^bb0(%arg2: index, %arg3: index, %arg4: index, %arg5: index):
          tensor.yield %c0_i32 : i32
        } : tensor<1x60x?x1xi32> to tensor<1x60x?x1xi32>
        %15 = linalg.fill ins(%c0_i32 : i32) outs(%6 : tensor<1x60x64x1xi32>) -> tensor<1x60x64x1xi32>
        %16 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x60x?x1xi32>, tensor<1x43x1xi32>) outs(%15 : tensor<1x60x64x1xi32>) -> tensor<1x60x64x1xi32>
        flow.dispatch.tensor.store %16, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
      }
    }
    return
  }
}
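
After canonicalization the pad arithmetic is easy to read off: with a width-43 kernel at stride 1 and "same" padding, the low pad is (43 - 1) / 2 = 21, a 64-wide output tile needs an input window of 64 + 43 - 1 = 106 columns, and the window's right edge sits at x + 64 + 21 = x + 85; those are exactly the 21, 106, and 85 constants in the affine maps above. A small Python check of the clamped window computed by %7/%9/%11/%13 (a sketch assuming stride 1 and dilation 1, as the op attributes state):

KW = 43                       # kernel width, from tensor<1x43x1xi32>
TILE_W = 64                   # workgroup tile width
LOW_PAD = (KW - 1) // 2       # 21
WINDOW = TILE_W + KW - 1      # 106
HIGH_EDGE = TILE_W + LOW_PAD  # 85
assert (LOW_PAD, WINDOW, HIGH_EDGE) == (21, 106, 85)

def padded_window(x, w=1920):
    """Clamped input columns read for an output tile starting at column x."""
    lo = min(w, max(0, x - LOW_PAD))         # %9
    hi = min(w, max(0, x + HIGH_EDGE))       # %11
    low_pad = max(LOW_PAD - x, 0)            # %7
    high_pad = WINDOW - low_pad - (hi - lo)  # %13
    return lo, hi, low_pad, high_pad

assert padded_window(0) == (0, 85, 21, 0)          # leftmost tile pads 21 on the left
assert padded_window(1856) == (1835, 1920, 0, 21)  # rightmost tile pads 21 on the right
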
// -----// IR Dump After CSE (cse) //----- //
module {
  func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
    %c1920 = arith.constant 1920 : index
    %c1080 = arith.constant 1080 : index
    %c0 = arith.constant 0 : index
    %cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
    %c0_i32 = arith.constant 0 : i32
    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_count_x = hal.interface.workgroup.count[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_count_y = hal.interface.workgroup.count[1] : index
    %2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
    %3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
    scf.for %arg0 = %2 to %c1080 step %3 {
      %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
      %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
      scf.for %arg1 = %4 to %c1920 step %5 {
        %6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
        %7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
        %8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
        %9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
        %10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
        %11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
        %12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
        %13 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 106)>(%7, %11, %9)
        %14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
        %padded = tensor.pad %14 low[0, 0, %7, 0] high[0, 0, %13, 0] {
        ^bb0(%arg2: index, %arg3: index, %arg4: index, %arg5: index):
          tensor.yield %c0_i32 : i32
        } : tensor<1x60x?x1xi32> to tensor<1x60x?x1xi32>
        %15 = linalg.fill ins(%c0_i32 : i32) outs(%6 : tensor<1x60x64x1xi32>) -> tensor<1x60x64x1xi32>
        %16 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x60x?x1xi32>, tensor<1x43x1xi32>) outs(%15 : tensor<1x60x64x1xi32>) -> tensor<1x60x64x1xi32>
        flow.dispatch.tensor.store %16, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
      }
    }
    return
  }
}
// -----// IR Dump After TileAndDecomposeWinogradTransform (iree-linalg-ext-tile-and-decompose-winograd) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
  %c1920 = arith.constant 1920 : index
  %c1080 = arith.constant 1080 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
  %c0_i32 = arith.constant 0 : i32
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
  %3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
  scf.for %arg0 = %2 to %c1080 step %3 {
    %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
    %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
    scf.for %arg1 = %4 to %c1920 step %5 {
      %6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
      %7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
      %8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
      %9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
      %10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
      %11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
      %12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
      %13 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 106)>(%7, %11, %9)
      %14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
      %padded = tensor.pad %14 low[0, 0, %7, 0] high[0, 0, %13, 0] {
      ^bb0(%arg2: index, %arg3: index, %arg4: index, %arg5: index):
        tensor.yield %c0_i32 : i32
      } : tensor<1x60x?x1xi32> to tensor<1x60x?x1xi32>
      %15 = linalg.fill ins(%c0_i32 : i32) outs(%6 : tensor<1x60x64x1xi32>) -> tensor<1x60x64x1xi32>
      %16 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x60x?x1xi32>, tensor<1x43x1xi32>) outs(%15 : tensor<1x60x64x1xi32>) -> tensor<1x60x64x1xi32>
      flow.dispatch.tensor.store %16, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
    }
  }
  return
}
// -----// IR Dump After LinalgStrategyTileAndFusePass (iree-linalg-strategy-tile-and-fuse-pass) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
  %c32 = arith.constant 32 : index
  %c60 = arith.constant 60 : index
  %c64 = arith.constant 64 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c1920 = arith.constant 1920 : index
  %c1080 = arith.constant 1080 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
  %c0_i32 = arith.constant 0 : i32
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
  %3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
  scf.for %arg0 = %2 to %c1080 step %3 {
    %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
    %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
    scf.for %arg1 = %4 to %c1920 step %5 {
      %6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
      %7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
      %8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
      %9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
      %10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
      %11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
      %12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
      %13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
      %14 = scf.for %arg2 = %c0 to %c1 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
        %15 = scf.for %arg4 = %c0 to %c60 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
          %16 = scf.for %arg6 = %c0 to %c64 step %c32 iter_args(%arg7 = %arg5) -> (tensor<1x60x64x1xi32>) {
            %17 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x60x64x1xi32>) {
              %18 = affine.min affine_map<(d0) -> (d0, 1)>(%arg2)
              %19 = affine.min affine_map<(d0) -> (d0 + 1, 1)>(%arg2)
              %20 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%19, %18)
              %21 = affine.min affine_map<(d0) -> (d0, 60)>(%arg4)
              %22 = affine.min affine_map<(d0) -> (d0 + 1, 60)>(%arg4)
              %23 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%22, %21)
              %dim = tensor.dim %13, %c2 : tensor<1x60x?x1xi32>
              %24 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg6)
              %25 = affine.max affine_map<(d0, d1) -> (d0 - d1, 0)>(%arg6, %7)
              %26 = affine.min affine_map<(d0, d1) -> (d0, d1)>(%25, %dim)
              %27 = affine.max affine_map<(d0, d1) -> (d0 - d1 + 74, 0)>(%arg6, %7)
              %28 = affine.min affine_map<(d0, d1) -> (d0, d1)>(%27, %dim)
              %29 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%28, %26)
              %30 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%24, %28, %26)
              %31 = affine.min affine_map<(d0) -> (d0, 1)>(%arg8)
              %32 = affine.min affine_map<(d0) -> (d0 + 1, 1)>(%arg8)
              %33 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%32, %31)
              %extracted_slice = tensor.extract_slice %13[%18, %21, %26, %31] [%20, %23, %29, %33] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<?x?x?x?xi32>
              %padded = tensor.pad %extracted_slice low[0, 0, %24, 0] high[0, 0, %30, 0] {
              ^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
                tensor.yield %c0_i32 : i32
              } {__internal_linalg_transform__ = "1"} : tensor<?x?x?x?xi32> to tensor<?x?x?x?xi32>
              %cast = tensor.cast %padded : tensor<?x?x?x?xi32> to tensor<1x1x74x1xi32>
              %extracted_slice_0 = tensor.extract_slice %cst[0, 0, %arg8] [1, 43, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x43x1xi32>
              %extracted_slice_1 = tensor.extract_slice %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
              %34 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
              %35 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "1", dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%cast, %extracted_slice_0 : tensor<1x1x74x1xi32>, tensor<1x43x1xi32>) outs(%34 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
              %inserted_slice = tensor.insert_slice %35 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
              scf.yield %inserted_slice : tensor<1x60x64x1xi32>
            }
            scf.yield %17 : tensor<1x60x64x1xi32>
          }
          scf.yield %16 : tensor<1x60x64x1xi32>
        }
        scf.yield %15 : tensor<1x60x64x1xi32>
      }
      flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
    }
  }
  return
}
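
At this second tiling level the convolution is cut down to 1x1x32x1 output tiles, and the inner pad now yields a static tensor<1x1x74x1xi32>: the same window rule as before, 32 + 43 - 1 = 74, which is also the `+ 74` constant in the inner affine maps. In the spirit of the earlier sketch:

# Level-2 tile from the lowering_config: [1, 1, 32, 1, 0, 0].
INNER_TILE_W, KW = 32, 43
assert INNER_TILE_W + KW - 1 == 74  # width of the static tensor<1x1x74x1xi32>
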
// -----// IR Dump After CSE (cse) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
  %c32 = arith.constant 32 : index
  %c60 = arith.constant 60 : index
  %c64 = arith.constant 64 : index
  %c1 = arith.constant 1 : index
  %c1920 = arith.constant 1920 : index
  %c1080 = arith.constant 1080 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
  %c0_i32 = arith.constant 0 : i32
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
  %3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
  scf.for %arg0 = %2 to %c1080 step %3 {
    scf.for %arg1 = %4 to %c1920 step %5 {
      %6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
      %7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
      %8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
      %9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
      %10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
      %11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
      %12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
      %13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
      %14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
        %15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
          %16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
          %17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
          %18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
          %19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
          %20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
          %21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
          %22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
          %extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
          %padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
          ^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
            tensor.yield %c0_i32 : i32
          } : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
          %extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
          %23 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
          %24 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "1", dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x1x74x1xi32>, tensor<1x43x1xi32>) outs(%23 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
          %inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
          scf.yield %inserted_slice : tensor<1x60x64x1xi32>
        }
        scf.yield %15 : tensor<1x60x64x1xi32>
      }
      flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
    }
  }
  return
}
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
  %c32 = arith.constant 32 : index
  %c60 = arith.constant 60 : index
  %c64 = arith.constant 64 : index
  %c1 = arith.constant 1 : index
  %c1920 = arith.constant 1920 : index
  %c1080 = arith.constant 1080 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
  %c0_i32 = arith.constant 0 : i32
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
  %3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
  scf.for %arg0 = %2 to %c1080 step %3 {
    scf.for %arg1 = %4 to %c1920 step %5 {
      %6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
      %7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
      %8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
      %9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
      %10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
      %11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
      %12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
      %13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
      %14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
        %15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
          %16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
          %17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
          %18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
          %19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
          %20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
          %21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
          %22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
          %extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
          %padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
          ^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
            tensor.yield %c0_i32 : i32
          } : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
          %extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
          %23 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
          %24 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "1", dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x1x74x1xi32>, tensor<1x43x1xi32>) outs(%23 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
          %inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
          scf.yield %inserted_slice : tensor<1x60x64x1xi32>
        }
        scf.yield %15 : tensor<1x60x64x1xi32>
      }
      flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
    }
  }
  return
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (iree-linalg-strategy-remove-markers-pass) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
  %c32 = arith.constant 32 : index
  %c60 = arith.constant 60 : index
  %c64 = arith.constant 64 : index
  %c1 = arith.constant 1 : index
  %c1920 = arith.constant 1920 : index
  %c1080 = arith.constant 1080 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
  %c0_i32 = arith.constant 0 : i32
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
  %3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
  scf.for %arg0 = %2 to %c1080 step %3 {
    scf.for %arg1 = %4 to %c1920 step %5 {
      %6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
      %7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
      %8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
      %9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
      %10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
      %11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
      %12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
      %13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
      %14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
        %15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
          %16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
          %17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
          %18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
          %19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
          %20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
          %21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
          %22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
          %extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
          %padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
          ^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
            tensor.yield %c0_i32 : i32
          } : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
          %extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
          %23 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
          %24 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x1x74x1xi32>, tensor<1x43x1xi32>) outs(%23 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
          %inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
          scf.yield %inserted_slice : tensor<1x60x64x1xi32>
        }
        scf.yield %15 : tensor<1x60x64x1xi32>
      }
      flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
    }
  }
  return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
  %c32 = arith.constant 32 : index
  %c60 = arith.constant 60 : index
  %c64 = arith.constant 64 : index
  %c1 = arith.constant 1 : index
  %c1920 = arith.constant 1920 : index
  %c1080 = arith.constant 1080 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
  %c0_i32 = arith.constant 0 : i32
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
  %3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
  scf.for %arg0 = %2 to %c1080 step %3 {
    scf.for %arg1 = %4 to %c1920 step %5 {
      %6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
      %7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
      %8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
      %9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
      %10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
      %11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
      %12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
      %13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
      %14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
        %15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
          %16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
          %17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
          %18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
          %19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
          %20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
          %21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
          %22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
          %extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
          %padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
          ^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
            tensor.yield %c0_i32 : i32
          } : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
          %extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
          %23 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
          %24 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x1x74x1xi32>, tensor<1x43x1xi32>) outs(%23 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
          %inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
          scf.yield %inserted_slice : tensor<1x60x64x1xi32>
        }
        scf.yield %15 : tensor<1x60x64x1xi32>
      }
      flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
    }
  }
  return
}
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
  %c32 = arith.constant 32 : index
  %c60 = arith.constant 60 : index
  %c64 = arith.constant 64 : index
  %c1 = arith.constant 1 : index
  %c1920 = arith.constant 1920 : index
  %c1080 = arith.constant 1080 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
  %c0_i32 = arith.constant 0 : i32
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
  %3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
  scf.for %arg0 = %2 to %c1080 step %3 {
    scf.for %arg1 = %4 to %c1920 step %5 {
      %6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
      %7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
      %8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
      %9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
      %10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
      %11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
      %12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
      %13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
      %14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
        %15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
          %16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
          %17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
          %18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
          %19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
          %20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
          %21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
          %22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
          %extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
          %padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
          ^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
            tensor.yield %c0_i32 : i32
          } : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
          %extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
          %23 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
          %24 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x1x74x1xi32>, tensor<1x43x1xi32>) outs(%23 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
          %inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
          scf.yield %inserted_slice : tensor<1x60x64x1xi32>
        }
        scf.yield %15 : tensor<1x60x64x1xi32>
      }
      flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
    }
  }
  return
}
// -----// IR Dump After LinalgFuse (linalg-fuse) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
  %c32 = arith.constant 32 : index
  %c60 = arith.constant 60 : index
  %c64 = arith.constant 64 : index
  %c1 = arith.constant 1 : index
  %c1920 = arith.constant 1920 : index
  %c1080 = arith.constant 1080 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
  %c0_i32 = arith.constant 0 : i32
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
  %3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
  scf.for %arg0 = %2 to %c1080 step %3 {
    scf.for %arg1 = %4 to %c1920 step %5 {
      %6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
      %7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
      %8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
      %9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
      %10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
      %11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
      %12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
      %13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
      %14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
        %15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
          %16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
          %17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
          %18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
          %19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
          %20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
          %21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
          %22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18)
          %extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
          %padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] {
          ^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index):
            tensor.yield %c0_i32 : i32
          } : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
          %extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32> | |
%24 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x1x74x1xi32>, tensor<1x43x1xi32>) outs(%23 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32> | |
%inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32> | |
scf.yield %inserted_slice : tensor<1x60x64x1xi32> | |
} | |
scf.yield %15 : tensor<1x60x64x1xi32> | |
} | |
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
} | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
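// NOTE: no changes relative to the previous dump; the IR was already in canonical form. | |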
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c32 = arith.constant 32 : index | |
%c60 = arith.constant 60 : index | |
%c64 = arith.constant 64 : index | |
%c1 = arith.constant 1 : index | |
%c1920 = arith.constant 1920 : index | |
%c1080 = arith.constant 1080 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg0 = %2 to %c1080 step %3 { | |
scf.for %arg1 = %4 to %c1920 step %5 { | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32> | |
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1) | |
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1) | |
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8) | |
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1) | |
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10) | |
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9) | |
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32> | |
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) { | |
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) { | |
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4) | |
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7) | |
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9) | |
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7) | |
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9) | |
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18) | |
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18) | |
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32> | |
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] { | |
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32> | |
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32> | |
%23 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32> | |
%24 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x1x74x1xi32>, tensor<1x43x1xi32>) outs(%23 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32> | |
%inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32> | |
scf.yield %inserted_slice : tensor<1x60x64x1xi32> | |
} | |
scf.yield %15 : tensor<1x60x64x1xi32> | |
} | |
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
} | |
} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
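// NOTE: no changes relative to the previous dump; there were no redundant subexpressions to eliminate. | |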
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c32 = arith.constant 32 : index | |
%c60 = arith.constant 60 : index | |
%c64 = arith.constant 64 : index | |
%c1 = arith.constant 1 : index | |
%c1920 = arith.constant 1920 : index | |
%c1080 = arith.constant 1080 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg0 = %2 to %c1080 step %3 { | |
scf.for %arg1 = %4 to %c1920 step %5 { | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32> | |
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1) | |
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1) | |
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8) | |
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1) | |
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10) | |
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9) | |
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32> | |
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) { | |
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) { | |
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4) | |
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7) | |
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9) | |
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7) | |
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9) | |
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18) | |
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18) | |
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32> | |
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] { | |
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32> | |
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32> | |
%23 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32> | |
%24 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%padded, %cst : tensor<1x1x74x1xi32>, tensor<1x43x1xi32>) outs(%23 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32> | |
%inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32> | |
scf.yield %inserted_slice : tensor<1x60x64x1xi32> | |
} | |
scf.yield %15 : tensor<1x60x64x1xi32> | |
} | |
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
} | |
} | |
return | |
} | |
// -----// IR Dump After LinalgStrategyTilePass (iree-linalg-strategy-tile-pass) //----- // | |
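// NOTE: this pass tiles the reduction (filter) dimensions: the depthwise conv is now wrapped in scf.for loops over the unit filter height and the 43 filter taps, consuming a 1x1x32x1 input slice and a 1x1x1 filter slice per step, and the fill/conv ops are tagged with __internal_linalg_transform__ markers for the strategy passes that follow. | |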
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c43 = arith.constant 43 : index | |
%c32 = arith.constant 32 : index | |
%c60 = arith.constant 60 : index | |
%c64 = arith.constant 64 : index | |
%c1 = arith.constant 1 : index | |
%c1920 = arith.constant 1920 : index | |
%c1080 = arith.constant 1080 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg0 = %2 to %c1080 step %3 { | |
scf.for %arg1 = %4 to %c1920 step %5 { | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32> | |
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1) | |
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1) | |
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8) | |
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1) | |
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10) | |
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9) | |
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32> | |
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) { | |
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) { | |
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4) | |
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7) | |
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9) | |
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7) | |
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9) | |
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18) | |
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18) | |
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32> | |
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] { | |
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32> | |
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32> | |
%23 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32> | |
%24 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %23) -> (tensor<1x1x32x1xi32>) { | |
%25 = scf.for %arg8 = %c0 to %c43 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x32x1xi32>) { | |
%extracted_slice_1 = tensor.extract_slice %padded[0, %arg6, %arg8, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x74x1xi32> to tensor<1x1x32x1xi32> | |
%extracted_slice_2 = tensor.extract_slice %cst[%arg6, %arg8, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32> | |
%26 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "1", dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x1x32x1xi32>, tensor<1x1x1xi32>) outs(%arg9 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32> | |
scf.yield %26 : tensor<1x1x32x1xi32> | |
} | |
scf.yield %25 : tensor<1x1x32x1xi32> | |
} | |
%inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32> | |
scf.yield %inserted_slice : tensor<1x60x64x1xi32> | |
} | |
scf.yield %15 : tensor<1x60x64x1xi32> | |
} | |
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
} | |
} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
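// NOTE: the single-iteration loop over the unit filter-height dimension has been folded away; only the scf.for over the 43 filter taps remains. | |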
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c43 = arith.constant 43 : index | |
%c32 = arith.constant 32 : index | |
%c60 = arith.constant 60 : index | |
%c64 = arith.constant 64 : index | |
%c1 = arith.constant 1 : index | |
%c1920 = arith.constant 1920 : index | |
%c1080 = arith.constant 1080 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg0 = %2 to %c1080 step %3 { | |
scf.for %arg1 = %4 to %c1920 step %5 { | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32> | |
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1) | |
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1) | |
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8) | |
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1) | |
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10) | |
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9) | |
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32> | |
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) { | |
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) { | |
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4) | |
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7) | |
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9) | |
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7) | |
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9) | |
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18) | |
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18) | |
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32> | |
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] { | |
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32> | |
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32> | |
%23 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32> | |
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %23) -> (tensor<1x1x32x1xi32>) { | |
%extracted_slice_1 = tensor.extract_slice %padded[0, 0, %arg6, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x74x1xi32> to tensor<1x1x32x1xi32> | |
%extracted_slice_2 = tensor.extract_slice %cst[0, %arg6, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32> | |
%25 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "1", dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x1x32x1xi32>, tensor<1x1x1xi32>) outs(%arg7 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32> | |
scf.yield %25 : tensor<1x1x32x1xi32> | |
} | |
%inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32> | |
scf.yield %inserted_slice : tensor<1x60x64x1xi32> | |
} | |
scf.yield %15 : tensor<1x60x64x1xi32> | |
} | |
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
} | |
} | |
return | |
} | |
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- // | |
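// NOTE: no changes relative to the previous dump. | |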
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c43 = arith.constant 43 : index | |
%c32 = arith.constant 32 : index | |
%c60 = arith.constant 60 : index | |
%c64 = arith.constant 64 : index | |
%c1 = arith.constant 1 : index | |
%c1920 = arith.constant 1920 : index | |
%c1080 = arith.constant 1080 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg0 = %2 to %c1080 step %3 { | |
scf.for %arg1 = %4 to %c1920 step %5 { | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32> | |
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1) | |
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1) | |
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8) | |
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1) | |
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10) | |
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9) | |
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32> | |
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) { | |
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) { | |
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4) | |
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7) | |
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9) | |
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7) | |
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9) | |
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18) | |
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18) | |
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32> | |
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] { | |
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32> | |
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32> | |
%23 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32> | |
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %23) -> (tensor<1x1x32x1xi32>) { | |
%extracted_slice_1 = tensor.extract_slice %padded[0, 0, %arg6, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x74x1xi32> to tensor<1x1x32x1xi32> | |
%extracted_slice_2 = tensor.extract_slice %cst[0, %arg6, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32> | |
%25 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "1", dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 60, 64, 0, 0, 0], [1, 1, 32, 1, 0, 0], [0, 0, 0, 0, 1, 1]]>, strides = dense<1> : tensor<2xi64>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x1x32x1xi32>, tensor<1x1x1xi32>) outs(%arg7 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32> | |
scf.yield %25 : tensor<1x1x32x1xi32> | |
} | |
%inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32> | |
scf.yield %inserted_slice : tensor<1x60x64x1xi32> | |
} | |
scf.yield %15 : tensor<1x60x64x1xi32> | |
} | |
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
} | |
} | |
return | |
} | |
// -----// IR Dump After LinalgStrategyDecomposePass (iree-linalg-strategy-decompose-pass) //----- // | |
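// NOTE: the decomposition drops the unit height dimensions with rank-reducing tensor.extract_slice/tensor.insert_slice pairs and rewrites the kernel as a linalg.depthwise_conv_1d_nwc_wc on 1x32x1 tiles. | |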
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c43 = arith.constant 43 : index | |
%c32 = arith.constant 32 : index | |
%c60 = arith.constant 60 : index | |
%c64 = arith.constant 64 : index | |
%c1 = arith.constant 1 : index | |
%c1920 = arith.constant 1920 : index | |
%c1080 = arith.constant 1080 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg0 = %2 to %c1080 step %3 { | |
scf.for %arg1 = %4 to %c1920 step %5 { | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32> | |
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1) | |
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1) | |
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8) | |
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1) | |
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10) | |
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9) | |
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32> | |
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) { | |
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) { | |
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4) | |
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7) | |
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9) | |
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7) | |
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9) | |
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18) | |
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18) | |
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32> | |
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] { | |
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32> | |
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32> | |
%23 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32> | |
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %23) -> (tensor<1x1x32x1xi32>) { | |
%extracted_slice_1 = tensor.extract_slice %padded[0, 0, %arg6, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x74x1xi32> to tensor<1x1x32x1xi32> | |
%extracted_slice_2 = tensor.extract_slice %cst[0, %arg6, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32> | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice_1[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32> | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice_2[0, 0, 0] [1, 1, 1] [1, 1, 1] : tensor<1x1x1xi32> to tensor<1x1xi32> | |
%extracted_slice_5 = tensor.extract_slice %arg7[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32> | |
%25 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<1x32x1xi32>, tensor<1x1xi32>) outs(%extracted_slice_5 : tensor<1x32x1xi32>) -> tensor<1x32x1xi32> | |
%inserted_slice_6 = tensor.insert_slice %25 into %arg7[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32> | |
scf.yield %inserted_slice_6 : tensor<1x1x32x1xi32> | |
} | |
%inserted_slice = tensor.insert_slice %24 into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32> | |
scf.yield %inserted_slice : tensor<1x60x64x1xi32> | |
} | |
scf.yield %15 : tensor<1x60x64x1xi32> | |
} | |
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
} | |
} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
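// NOTE: the rank-reduced accumulator slice is now hoisted above the 43-tap loop and threaded through it as a second iter_arg, so the per-iteration extract of the 4-D tensor disappears; the insert_slice back into the 4-D tensor happens once, after the loop. | |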
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c43 = arith.constant 43 : index | |
%c32 = arith.constant 32 : index | |
%c60 = arith.constant 60 : index | |
%c64 = arith.constant 64 : index | |
%c1 = arith.constant 1 : index | |
%c1920 = arith.constant 1920 : index | |
%c1080 = arith.constant 1080 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg0 = %2 to %c1080 step %3 { | |
scf.for %arg1 = %4 to %c1920 step %5 { | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32> | |
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1) | |
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1) | |
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8) | |
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1) | |
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10) | |
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9) | |
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32> | |
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) { | |
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) { | |
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4) | |
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7) | |
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9) | |
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7) | |
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9) | |
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18) | |
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18) | |
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32> | |
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] { | |
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32> | |
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32> | |
%23 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32> | |
%extracted_slice_1 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32> | |
%24:2 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %23, %arg8 = %extracted_slice_1) -> (tensor<1x1x32x1xi32>, tensor<1x32x1xi32>) { | |
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, %arg6, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x74x1xi32> to tensor<1x1x32x1xi32> | |
%extracted_slice_4 = tensor.extract_slice %cst[0, %arg6, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32> | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32> | |
%extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0] [1, 1, 1] [1, 1, 1] : tensor<1x1x1xi32> to tensor<1x1xi32> | |
%25 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x32x1xi32>, tensor<1x1xi32>) outs(%arg8 : tensor<1x32x1xi32>) -> tensor<1x32x1xi32> | |
scf.yield %arg7, %25 : tensor<1x1x32x1xi32>, tensor<1x32x1xi32> | |
} | |
%inserted_slice = tensor.insert_slice %24#1 into %24#0[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32> | |
%inserted_slice_2 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32> | |
scf.yield %inserted_slice_2 : tensor<1x60x64x1xi32> | |
} | |
scf.yield %15 : tensor<1x60x64x1xi32> | |
} | |
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
} | |
} | |
return | |
} | |
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- // | |
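// NOTE: no changes relative to the previous dump. | |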
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c43 = arith.constant 43 : index | |
%c32 = arith.constant 32 : index | |
%c60 = arith.constant 60 : index | |
%c64 = arith.constant 64 : index | |
%c1 = arith.constant 1 : index | |
%c1920 = arith.constant 1920 : index | |
%c1080 = arith.constant 1080 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg0 = %2 to %c1080 step %3 { | |
scf.for %arg1 = %4 to %c1920 step %5 { | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32> | |
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1) | |
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1) | |
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8) | |
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1) | |
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10) | |
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9) | |
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32> | |
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) { | |
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) { | |
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4) | |
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7) | |
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9) | |
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7) | |
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9) | |
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18) | |
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18) | |
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32> | |
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] { | |
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32> | |
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32> | |
%23 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32> | |
%extracted_slice_1 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32> | |
%24:2 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %23, %arg8 = %extracted_slice_1) -> (tensor<1x1x32x1xi32>, tensor<1x32x1xi32>) { | |
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, %arg6, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x74x1xi32> to tensor<1x1x32x1xi32> | |
%extracted_slice_4 = tensor.extract_slice %cst[0, %arg6, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32> | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32> | |
%extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0] [1, 1, 1] [1, 1, 1] : tensor<1x1x1xi32> to tensor<1x1xi32> | |
%25 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x32x1xi32>, tensor<1x1xi32>) outs(%arg8 : tensor<1x32x1xi32>) -> tensor<1x32x1xi32> | |
scf.yield %arg7, %25 : tensor<1x1x32x1xi32>, tensor<1x32x1xi32> | |
} | |
%inserted_slice = tensor.insert_slice %24#1 into %24#0[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32> | |
%inserted_slice_2 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32> | |
scf.yield %inserted_slice_2 : tensor<1x60x64x1xi32> | |
} | |
scf.yield %15 : tensor<1x60x64x1xi32> | |
} | |
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
} | |
} | |
return | |
} | |
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (iree-linalg-strategy-remove-markers-pass) //----- // | |
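// NOTE: the temporary __internal_linalg_transform__ marker attributes are stripped now that the tile-and-decompose strategy has run (compare the linalg.fill below with the previous dump). | |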
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c43 = arith.constant 43 : index | |
%c32 = arith.constant 32 : index | |
%c60 = arith.constant 60 : index | |
%c64 = arith.constant 64 : index | |
%c1 = arith.constant 1 : index | |
%c1920 = arith.constant 1920 : index | |
%c1080 = arith.constant 1080 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg0 = %2 to %c1080 step %3 { | |
scf.for %arg1 = %4 to %c1920 step %5 { | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32> | |
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1) | |
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1) | |
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8) | |
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1) | |
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10) | |
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9) | |
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32> | |
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) { | |
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) { | |
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4) | |
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7) | |
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9) | |
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7) | |
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9) | |
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18) | |
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18) | |
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32> | |
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] { | |
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32> | |
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32> | |
%23 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32> | |
%extracted_slice_1 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32> | |
%24:2 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %23, %arg8 = %extracted_slice_1) -> (tensor<1x1x32x1xi32>, tensor<1x32x1xi32>) { | |
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, %arg6, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x74x1xi32> to tensor<1x1x32x1xi32> | |
%extracted_slice_4 = tensor.extract_slice %cst[0, %arg6, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32> | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32> | |
%extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0] [1, 1, 1] [1, 1, 1] : tensor<1x1x1xi32> to tensor<1x1xi32> | |
%25 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x32x1xi32>, tensor<1x1xi32>) outs(%arg8 : tensor<1x32x1xi32>) -> tensor<1x32x1xi32> | |
scf.yield %arg7, %25 : tensor<1x1x32x1xi32>, tensor<1x32x1xi32> | |
} | |
%inserted_slice = tensor.insert_slice %24#1 into %24#0[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32> | |
%inserted_slice_2 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32> | |
scf.yield %inserted_slice_2 : tensor<1x60x64x1xi32> | |
} | |
scf.yield %15 : tensor<1x60x64x1xi32> | |
} | |
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
} | |
} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
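// NOTE: the 4-D iter_arg was dead (yielded unchanged on every iteration) and is removed; the loop now carries only the 1x32x1 accumulator, which is inserted back into the fill result %23 once the loop finishes. | |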
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c43 = arith.constant 43 : index | |
%c32 = arith.constant 32 : index | |
%c60 = arith.constant 60 : index | |
%c64 = arith.constant 64 : index | |
%c1 = arith.constant 1 : index | |
%c1920 = arith.constant 1920 : index | |
%c1080 = arith.constant 1080 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg0 = %2 to %c1080 step %3 { | |
scf.for %arg1 = %4 to %c1920 step %5 { | |
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32> | |
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1) | |
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1) | |
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8) | |
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1) | |
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10) | |
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9) | |
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32> | |
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) { | |
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) { | |
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4) | |
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7) | |
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9) | |
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7) | |
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9) | |
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18) | |
%22 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 74)>(%16, %20, %18) | |
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32> | |
%padded = tensor.pad %extracted_slice low[0, 0, %16, 0] high[0, 0, %22, 0] { | |
^bb0(%arg6: index, %arg7: index, %arg8: index, %arg9: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32> | |
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32> | |
%23 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32> | |
%extracted_slice_1 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32> | |
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_1) -> (tensor<1x32x1xi32>) { | |
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, %arg6, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x74x1xi32> to tensor<1x1x32x1xi32> | |
%extracted_slice_4 = tensor.extract_slice %cst[0, %arg6, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32> | |
%extracted_slice_5 = tensor.extract_slice %extracted_slice_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32> | |
%extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0] [1, 1, 1] [1, 1, 1] : tensor<1x1x1xi32> to tensor<1x1xi32> | |
%25 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x32x1xi32>, tensor<1x1xi32>) outs(%arg7 : tensor<1x32x1xi32>) -> tensor<1x32x1xi32> | |
scf.yield %25 : tensor<1x32x1xi32> | |
} | |
%inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32> | |
%inserted_slice_2 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32> | |
scf.yield %inserted_slice_2 : tensor<1x60x64x1xi32> | |
} | |
scf.yield %15 : tensor<1x60x64x1xi32> | |
} | |
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
} | |
} | |
return | |
} | |
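// Note on the structure above: each workgroup owns a 60x64 output tile, re-tiled into
// 1x32 strips by the %arg2/%arg4 loops, and the innermost loop (%arg6 = 0..43) walks the
// 43 filter taps. With stride 1, a 32-wide output strip consumes a 32 + 43 - 1 = 74-wide
// input window, which is exactly why the tensor.pad above yields tensor<1x1x74x1xi32>.
// The tap loop is the decomposed form of a single 1-D depthwise convolution; a minimal
// standalone sketch of the undecomposed op (hypothetical %in/%k/%acc names, not taken
// from this dump) would be:
//
//   %r = linalg.depthwise_conv_1d_nwc_wc
//          {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>}
//          ins(%in, %k : tensor<1x74x1xi32>, tensor<43x1xi32>)
//          outs(%acc : tensor<1x32x1xi32>) -> tensor<1x32x1xi32>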
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- //
// (no change: the IR after this pass is identical to the dump above)
// -----// IR Dump After LinalgSingleTilingExpert (linalg-single-tiling-expert-driver) //----- //
// (no change: the IR after this pass is identical to the dump above)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (no change: the IR after this pass is identical to the dump above)
// -----// IR Dump After CSE (cse) //----- //
// (no change: the IR after this pass is identical to the dump above)
// -----// IR Dump After FuseTensorPadWithConsumer (iree-codegen-fuse-tensor-pad-with-consumer) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%22 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%extracted_slice_1 = tensor.extract_slice %22[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%23 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_1) -> (tensor<1x32x1xi32>) {
%24 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%16, %arg6)
%25 = affine.max affine_map<(d0, d1) -> (d0 - d1, 0)>(%arg6, %16)
%26 = affine.min affine_map<(d0, d1, d2) -> (d0, d1 - d2)>(%25, %20, %18)
%27 = affine.max affine_map<(d0, d1) -> (d0 - d1 + 32, 0)>(%arg6, %16)
%28 = affine.min affine_map<(d0, d1, d2) -> (d0, d1 - d2)>(%27, %20, %18)
%29 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%28, %26)
%30 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 32)>(%24, %28, %26)
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, %26, 0] [1, 1, %29, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice_3 low[0, 0, %24, 0] high[0, 0, %30, 0] {
^bb0(%arg8: index, %arg9: index, %arg10: index, %arg11: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
%cast = tensor.cast %padded : tensor<1x1x?x1xi32> to tensor<1x1x32x1xi32>
%extracted_slice_4 = tensor.extract_slice %cst[0, %arg6, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32>
%extracted_slice_5 = tensor.extract_slice %cast[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0] [1, 1, 1] [1, 1, 1] : tensor<1x1x1xi32> to tensor<1x1xi32>
%31 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x32x1xi32>, tensor<1x1xi32>) outs(%arg7 : tensor<1x32x1xi32>) -> tensor<1x32x1xi32>
scf.yield %31 : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %23 into %22[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_2 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_2 : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
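// Note: relative to the previous dump, FuseTensorPadWithConsumer has sunk the 74-wide
// tensor.pad into the filter-tap loop (%arg6). Each tap now extracts and pads only the
// 32-wide window it actually reads, so the pad result is dynamically shaped and a
// tensor.cast asserts the known 32-element extent. Schematically (illustrative names,
// not taken from this dump):
//
//   before:  %p = tensor.pad %slice ... : tensor<1x1x?x1xi32> to tensor<1x1x74x1xi32>
//            scf.for %tap ... { use tensor.extract_slice %p[0, 0, %tap, 0] [1, 1, 32, 1] ... }
//   after:   scf.for %tap ... { %w = tensor.pad (tensor.extract_slice %slice ...) ... : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
//                               %c = tensor.cast %w : tensor<1x1x?x1xi32> to tensor<1x1x32x1xi32> }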
// -----// IR Dump After ConcretizePadResultShape (iree-codegen-concretize-pad-result-shape) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%extracted_slice_0 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%22 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_0 : tensor<1x1x32x1xi32>) -> tensor<1x1x32x1xi32>
%extracted_slice_1 = tensor.extract_slice %22[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%23 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_1) -> (tensor<1x32x1xi32>) {
%24 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%16, %arg6)
%25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %16)
%26 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%25, %20, %18)
%27 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %16)
%28 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%27, %20, %18)
%29 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%28, %26)
%30 = affine.apply affine_map<(d0, d1, d2) -> (-d0 - d1 + d2 + 32)>(%24, %28, %26)
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, %26, 0] [1, 1, %29, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
%padded = tensor.pad %extracted_slice_3 low[0, 0, %24, 0] high[0, 0, %30, 0] {
^bb0(%arg8: index, %arg9: index, %arg10: index, %arg11: index):
tensor.yield %c0_i32 : i32
} : tensor<1x1x?x1xi32> to tensor<1x1x32x1xi32>
%extracted_slice_4 = tensor.extract_slice %cst[0, %arg6, 0] [1, 1, 1] [1, 1, 1] : tensor<1x43x1xi32> to tensor<1x1x1xi32>
%extracted_slice_5 = tensor.extract_slice %padded[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%extracted_slice_6 = tensor.extract_slice %extracted_slice_4[0, 0, 0] [1, 1, 1] [1, 1, 1] : tensor<1x1x1xi32> to tensor<1x1xi32>
%31 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x32x1xi32>, tensor<1x1xi32>) outs(%arg7 : tensor<1x32x1xi32>) -> tensor<1x32x1xi32>
scf.yield %31 : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %23 into %22[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_2 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_2 : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
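// Note: ConcretizePadResultShape has proven the per-tap pad is always exactly 32 wide:
// with %29 = %28 - %26 and %30 = -%24 - %28 + %26 + 32, the total extent is
// low + size + high = %24 + (%28 - %26) + (-%24 - %28 + %26 + 32) = 32. The pad result
// type is therefore the static tensor<1x1x32x1xi32>, and the tensor.cast from the
// previous dump disappears; the affine_max/affine_min operand orders are also
// normalized, e.g. (d0 - d1, 0) becomes (0, d0 - d1).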
// -----// IR Dump After LinalgStrategyVectorizePass (iree-linalg-strategy-vectorize-pass) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%cst = arith.constant dense<0> : vector<1x1x32x1xi32>
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%6 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %9, 0], sizes = [1, 60, %12, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%14 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %6) -> (tensor<1x60x64x1xi32>) {
%15 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4)
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7)
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %11, %9)
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7)
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %11, %9)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18)
%extracted_slice = tensor.extract_slice %13[0, %arg2, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%extracted_slice_1 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%22 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_2 = tensor.extract_slice %22[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%23 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_2) -> (tensor<1x32x1xi32>) {
%24 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%16, %arg6)
%25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %16)
%26 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%25, %20, %18)
%27 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %16)
%28 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%27, %20, %18)
%29 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%28, %26)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, 0, %26, 0] [1, 1, %29, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
%30 = tensor.empty() : tensor<1x1x32x1xi32>
%31 = vector.transfer_write %cst, %30[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%32 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, false, true]} : tensor<1x1x?x1xi32>, vector<1x1x32x1xi32>
%33 = vector.transfer_write %32, %31[%c0, %c0, %24, %c0] {in_bounds = [true, true, false, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_5 = tensor.extract_slice %33[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%collapsed = tensor.collapse_shape %extracted_slice_5 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%collapsed_6 = tensor.collapse_shape %arg7 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%34 = vector.transfer_read %collapsed[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
%35 = vector.transfer_read %cst_0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x43x1xi32>, vector<1x1xi32>
%36 = vector.transfer_read %collapsed_6[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
%37 = vector.extract %35[0] : vector<1x1xi32>
%38 = vector.shuffle %37, %37 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%39 = vector.broadcast %38 : vector<32xi32> to vector<1x32xi32>
%40 = arith.muli %34, %39 : vector<1x32xi32>
%41 = arith.addi %40, %36 : vector<1x32xi32>
%42 = vector.transfer_write %41, %collapsed_6[%c0, %c0] {in_bounds = [true, true]} : vector<1x32xi32>, tensor<1x32xi32>
%expanded = tensor.expand_shape %42 [[0], [1, 2]] : tensor<1x32xi32> into tensor<1x32x1xi32>
scf.yield %expanded : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %23 into %22[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_3 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_3 : tensor<1x60x64x1xi32>
}
scf.yield %15 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %14, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
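// Note: vectorization replaces linalg.depthwise_conv_1d_nwc_wc with plain vector
// arithmetic on vector<1x32xi32>. Per tap %arg6 the body now (1) stages the padded
// 32-wide window through a zero-initialized 1x1x32x1 scratch tensor via
// vector.transfer_read/vector.transfer_write, (2) collapses the unit channel dimension,
// (3) splats the scalar tap k[%arg6] across the vector (vector.extract, a 32-way
// vector.shuffle, then vector.broadcast), and (4) accumulates
// acc[w] = acc[w] + in[w] * k[%arg6] with arith.muli followed by arith.addi.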
// -----// IR Dump After CSE (cse) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
%cst = arith.constant dense<0> : vector<1x1x32x1xi32>
%c43 = arith.constant 43 : index
%c32 = arith.constant 32 : index
%c60 = arith.constant 60 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c1920 = arith.constant 1920 : index
%c1080 = arith.constant 1080 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
%6 = tensor.empty() : tensor<1x1x32x1xi32>
scf.for %arg0 = %2 to %c1080 step %3 {
scf.for %arg1 = %4 to %c1920 step %5 {
%7 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
%8 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
%9 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
%10 = affine.min affine_map<(d0) -> (1920, d0)>(%9)
%11 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
%12 = affine.min affine_map<(d0) -> (1920, d0)>(%11)
%13 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%12, %10)
%14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %10, 0], sizes = [1, 60, %13, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
%15 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %7) -> (tensor<1x60x64x1xi32>) {
%16 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%8, %arg4)
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %8)
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %12, %10)
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %8)
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %12, %10)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%extracted_slice = tensor.extract_slice %14[0, %arg2, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
%extracted_slice_1 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
%23 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_2 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_2) -> (tensor<1x32x1xi32>) {
%25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%17, %arg6)
%26 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %17)
%27 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%26, %21, %19)
%28 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %17)
%29 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%28, %21, %19)
%30 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%29, %27)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, 0, %27, 0] [1, 1, %30, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
%31 = vector.transfer_write %cst, %6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%32 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, false, true]} : tensor<1x1x?x1xi32>, vector<1x1x32x1xi32>
%33 = vector.transfer_write %32, %31[%c0, %c0, %25, %c0] {in_bounds = [true, true, false, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
%extracted_slice_5 = tensor.extract_slice %33[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
%collapsed = tensor.collapse_shape %extracted_slice_5 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%collapsed_6 = tensor.collapse_shape %arg7 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
%34 = vector.transfer_read %collapsed[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
%35 = vector.transfer_read %cst_0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x43x1xi32>, vector<1x1xi32>
%36 = vector.transfer_read %collapsed_6[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
%37 = vector.extract %35[0] : vector<1x1xi32>
%38 = vector.shuffle %37, %37 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
%39 = vector.broadcast %38 : vector<32xi32> to vector<1x32xi32>
%40 = arith.muli %34, %39 : vector<1x32xi32>
%41 = arith.addi %40, %36 : vector<1x32xi32>
%42 = vector.transfer_write %41, %collapsed_6[%c0, %c0] {in_bounds = [true, true]} : vector<1x32xi32>, tensor<1x32xi32>
%expanded = tensor.expand_shape %42 [[0], [1, 2]] : tensor<1x32xi32> into tensor<1x32x1xi32>
scf.yield %expanded : tensor<1x32x1xi32>
}
%inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
%inserted_slice_3 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
scf.yield %inserted_slice_3 : tensor<1x60x64x1xi32>
}
scf.yield %16 : tensor<1x60x64x1xi32>
}
flow.dispatch.tensor.store %15, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
}
}
return
}
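// Note: in the dump above the scratch tile %6 = tensor.empty() : tensor<1x1x32x1xi32>
// now sits above the workgroup loops instead of inside the tap loop, so a single
// staging tile is reused across every tap and tile iteration.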
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- // | |
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%cst = arith.constant dense<0> : vector<1x1x32x1xi32> | |
%c43 = arith.constant 43 : index | |
%c32 = arith.constant 32 : index | |
%c60 = arith.constant 60 : index | |
%c64 = arith.constant 64 : index | |
%c1 = arith.constant 1 : index | |
%c1920 = arith.constant 1920 : index | |
%c1080 = arith.constant 1080 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
  %3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
  %6 = tensor.empty() : tensor<1x1x32x1xi32>
  scf.for %arg0 = %2 to %c1080 step %3 {
    scf.for %arg1 = %4 to %c1920 step %5 {
      %7 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
      %8 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
      %9 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
      %10 = affine.min affine_map<(d0) -> (1920, d0)>(%9)
      %11 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
      %12 = affine.min affine_map<(d0) -> (1920, d0)>(%11)
      %13 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%12, %10)
      %14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %10, 0], sizes = [1, 60, %13, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
      %15 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %7) -> (tensor<1x60x64x1xi32>) {
        %16 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
          %17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%8, %arg4)
          %18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %8)
          %19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %12, %10)
          %20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %8)
          %21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %12, %10)
          %22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
          %extracted_slice = tensor.extract_slice %14[0, %arg2, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
          %extracted_slice_1 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
          %23 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
          %extracted_slice_2 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
          %24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_2) -> (tensor<1x32x1xi32>) {
            %25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%17, %arg6)
            %26 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %17)
            %27 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%26, %21, %19)
            %28 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %17)
            %29 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%28, %21, %19)
            %30 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%29, %27)
            %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, 0, %27, 0] [1, 1, %30, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
            %31 = vector.transfer_write %cst, %6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
            %32 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, false, true]} : tensor<1x1x?x1xi32>, vector<1x1x32x1xi32>
            %33 = vector.transfer_write %32, %31[%c0, %c0, %25, %c0] {in_bounds = [true, true, false, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
            %extracted_slice_5 = tensor.extract_slice %33[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
            %collapsed = tensor.collapse_shape %extracted_slice_5 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
            %collapsed_6 = tensor.collapse_shape %arg7 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
            %34 = vector.transfer_read %collapsed[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
            %35 = vector.transfer_read %cst_0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x43x1xi32>, vector<1x1xi32>
            %36 = vector.transfer_read %collapsed_6[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
            %37 = vector.extract %35[0] : vector<1x1xi32>
            %38 = vector.shuffle %37, %37 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
            %39 = vector.broadcast %38 : vector<32xi32> to vector<1x32xi32>
            %40 = arith.muli %34, %39 : vector<1x32xi32>
            %41 = arith.addi %40, %36 : vector<1x32xi32>
            %42 = vector.transfer_write %41, %collapsed_6[%c0, %c0] {in_bounds = [true, true]} : vector<1x32xi32>, tensor<1x32xi32>
            %expanded = tensor.expand_shape %42 [[0], [1, 2]] : tensor<1x32xi32> into tensor<1x32x1xi32>
            scf.yield %expanded : tensor<1x32x1xi32>
          }
          %inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
          %inserted_slice_3 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
          scf.yield %inserted_slice_3 : tensor<1x60x64x1xi32>
        }
        scf.yield %16 : tensor<1x60x64x1xi32>
      }
      flow.dispatch.tensor.store %15, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
    }
  }
  return
}
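// The loop nest above (repeated verbatim in the next several dumps) is the
// vectorized depthwise convolution: a 1x1080x1920x1 i32 image convolved with
// the effectively one-dimensional, symmetric 43-tap filter held in %cst_0.
// Each workgroup produces a 60x64 output tile. For a tile starting at column
// x, the affine.max/affine.min chains clamp the input window
// [x - 21, x + 85) (64 + 43 - 1 = 106 columns) against the [0, 1920) image
// extent; lanes outside the valid region are filled with the %c0_i32 padding
// value of vector.transfer_read, which appears to be how the zero-padding is
// realized after pad fusion. The innermost scf.for over %c43 accumulates,
// per filter tap kw, roughly
//   acc[x] += in[x + kw - 21] * filter[kw]
// on 32-wide i32 vectors, splatting the tap via vector.shuffle and
// vector.broadcast.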
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (iree-linalg-strategy-remove-markers-pass) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
  %cst = arith.constant dense<0> : vector<1x1x32x1xi32>
  %c43 = arith.constant 43 : index
  %c32 = arith.constant 32 : index
  %c60 = arith.constant 60 : index
  %c64 = arith.constant 64 : index
  %c1 = arith.constant 1 : index
  %c1920 = arith.constant 1920 : index
  %c1080 = arith.constant 1080 : index
  %c0 = arith.constant 0 : index
  %cst_0 = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
  %c0_i32 = arith.constant 0 : i32
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
  %3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
  %6 = tensor.empty() : tensor<1x1x32x1xi32>
  scf.for %arg0 = %2 to %c1080 step %3 {
    scf.for %arg1 = %4 to %c1920 step %5 {
      %7 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
      %8 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
      %9 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
      %10 = affine.min affine_map<(d0) -> (1920, d0)>(%9)
      %11 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
      %12 = affine.min affine_map<(d0) -> (1920, d0)>(%11)
      %13 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%12, %10)
      %14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %10, 0], sizes = [1, 60, %13, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
      %15 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %7) -> (tensor<1x60x64x1xi32>) {
        %16 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
          %17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%8, %arg4)
          %18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %8)
          %19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %12, %10)
          %20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %8)
          %21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %12, %10)
          %22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
          %extracted_slice = tensor.extract_slice %14[0, %arg2, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
          %extracted_slice_1 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
          %23 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
          %extracted_slice_2 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
          %24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_2) -> (tensor<1x32x1xi32>) {
            %25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%17, %arg6)
            %26 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %17)
            %27 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%26, %21, %19)
            %28 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %17)
            %29 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%28, %21, %19)
            %30 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%29, %27)
            %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, 0, %27, 0] [1, 1, %30, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
            %31 = vector.transfer_write %cst, %6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
            %32 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, false, true]} : tensor<1x1x?x1xi32>, vector<1x1x32x1xi32>
            %33 = vector.transfer_write %32, %31[%c0, %c0, %25, %c0] {in_bounds = [true, true, false, true]} : vector<1x1x32x1xi32>, tensor<1x1x32x1xi32>
            %extracted_slice_5 = tensor.extract_slice %33[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
            %collapsed = tensor.collapse_shape %extracted_slice_5 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
            %collapsed_6 = tensor.collapse_shape %arg7 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
            %34 = vector.transfer_read %collapsed[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
            %35 = vector.transfer_read %cst_0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x43x1xi32>, vector<1x1xi32>
            %36 = vector.transfer_read %collapsed_6[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x32xi32>, vector<1x32xi32>
            %37 = vector.extract %35[0] : vector<1x1xi32>
            %38 = vector.shuffle %37, %37 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
            %39 = vector.broadcast %38 : vector<32xi32> to vector<1x32xi32>
            %40 = arith.muli %34, %39 : vector<1x32xi32>
            %41 = arith.addi %40, %36 : vector<1x32xi32>
            %42 = vector.transfer_write %41, %collapsed_6[%c0, %c0] {in_bounds = [true, true]} : vector<1x32xi32>, tensor<1x32xi32>
            %expanded = tensor.expand_shape %42 [[0], [1, 2]] : tensor<1x32xi32> into tensor<1x32x1xi32>
            scf.yield %expanded : tensor<1x32x1xi32>
          }
          %inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
          %inserted_slice_3 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
          scf.yield %inserted_slice_3 : tensor<1x60x64x1xi32>
        }
        scf.yield %16 : tensor<1x60x64x1xi32>
      }
      flow.dispatch.tensor.store %15, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
    }
  }
  return
}
// -----// IR Dump After CSE (cse) //----- //
// (IR unchanged from the previous dump)
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- //
// (IR unchanged from the previous dump)
// -----// IR Dump After LinalgVectorizationExpert (linalg-vectorization-expert-driver) //----- //
// (IR unchanged from the previous dump)
// -----// IR Dump After CSE (cse) //----- //
// (IR unchanged from the previous dump)
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (IR unchanged from the previous dump)
// -----// IR Dump After OptimizeVectorTransfer (iree-codegen-optimize-vector-transfer) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
  %cst = arith.constant dense<0> : vector<32x1xi32>
  %c43 = arith.constant 43 : index
  %c32 = arith.constant 32 : index
  %c60 = arith.constant 60 : index
  %c64 = arith.constant 64 : index
  %c1 = arith.constant 1 : index
  %c1920 = arith.constant 1920 : index
  %c1080 = arith.constant 1080 : index
  %c0 = arith.constant 0 : index
  %cst_0 = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
  %c0_i32 = arith.constant 0 : i32
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
  %3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
  %6 = tensor.empty() : tensor<1x1x32x1xi32>
  scf.for %arg0 = %2 to %c1080 step %3 {
    scf.for %arg1 = %4 to %c1920 step %5 {
      %7 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32>
      %8 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
      %9 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
      %10 = affine.min affine_map<(d0) -> (1920, d0)>(%9)
      %11 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
      %12 = affine.min affine_map<(d0) -> (1920, d0)>(%11)
      %13 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%12, %10)
      %14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %10, 0], sizes = [1, 60, %13, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32>
      %15 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %7) -> (tensor<1x60x64x1xi32>) {
        %16 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) {
          %17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%8, %arg4)
          %18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %8)
          %19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %12, %10)
          %20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %8)
          %21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %12, %10)
          %22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
          %extracted_slice = tensor.extract_slice %14[0, %arg2, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32>
          %extracted_slice_1 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32>
          %23 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, tensor<1x1x32x1xi32>
          %extracted_slice_2 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
          %24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_2) -> (tensor<1x32x1xi32>) {
            %25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%17, %arg6)
            %26 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %17)
            %27 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%26, %21, %19)
            %28 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %17)
            %29 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%28, %21, %19)
            %30 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%29, %27)
            %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, 0, %27, 0] [1, 1, %30, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32>
            %31 = vector.transfer_write %cst, %6[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, tensor<1x1x32x1xi32>
            %32 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : tensor<1x1x?x1xi32>, vector<32x1xi32>
            %33 = vector.transfer_write %32, %31[%c0, %c0, %25, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, tensor<1x1x32x1xi32>
            %extracted_slice_5 = tensor.extract_slice %33[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32>
            %collapsed = tensor.collapse_shape %extracted_slice_5 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
            %collapsed_6 = tensor.collapse_shape %arg7 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32>
            %34 = vector.transfer_read %collapsed[%c0, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x32xi32>, vector<32xi32>
            %35 = vector.transfer_read %cst_0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x43x1xi32>, vector<1xi32>
            %36 = vector.transfer_read %collapsed_6[%c0, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x32xi32>, vector<32xi32>
            %37 = vector.shuffle %35, %35 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
            %38 = arith.muli %34, %37 : vector<32xi32>
            %39 = arith.addi %38, %36 : vector<32xi32>
            %40 = vector.transfer_write %39, %collapsed_6[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, tensor<1x32xi32>
            %expanded = tensor.expand_shape %40 [[0], [1, 2]] : tensor<1x32xi32> into tensor<1x32x1xi32>
            scf.yield %expanded : tensor<1x32x1xi32>
          }
          %inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32>
          %inserted_slice_3 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32>
          scf.yield %inserted_slice_3 : tensor<1x60x64x1xi32>
        }
        scf.yield %16 : tensor<1x60x64x1xi32>
      }
      flow.dispatch.tensor.store %15, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>>
    }
  }
  return
}
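// Relative to the previous dump, OptimizeVectorTransfer dropped the unit
// dimensions from the transfers (vector<1x1x32x1xi32> -> vector<32x1xi32> on
// the 4-D tensors, vector<1x32xi32> -> vector<32xi32> on the collapsed 2-D
// tensors) and folded away the vector.extract / vector.broadcast pair around
// the filter-tap splat, leaving a single vector.shuffle on the vector<1xi32>
// read of %cst_0.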
// -----// IR Dump After EliminateEmptyTensors (iree-eliminate-empty-tensors) //----- // | |
module { | |
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%cst = arith.constant dense<0> : vector<32x1xi32> | |
%c43 = arith.constant 43 : index | |
%c32 = arith.constant 32 : index | |
%c60 = arith.constant 60 : index | |
%c64 = arith.constant 64 : index | |
%c1 = arith.constant 1 : index | |
%c1920 = arith.constant 1920 : index | |
%c1080 = arith.constant 1080 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
%6 = tensor.empty() : tensor<1x1x32x1xi32> | |
scf.for %arg0 = %2 to %c1080 step %3 { | |
scf.for %arg1 = %4 to %c1920 step %5 { | |
%7 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32> | |
%8 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1) | |
%9 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1) | |
%10 = affine.min affine_map<(d0) -> (1920, d0)>(%9) | |
%11 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1) | |
%12 = affine.min affine_map<(d0) -> (1920, d0)>(%11) | |
%13 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%12, %10) | |
%14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %10, 0], sizes = [1, 60, %13, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32> | |
%15 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %7) -> (tensor<1x60x64x1xi32>) { | |
%16 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) { | |
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%8, %arg4) | |
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %8) | |
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %12, %10) | |
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %8) | |
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %12, %10) | |
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19) | |
%extracted_slice = tensor.extract_slice %14[0, %arg2, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32> | |
%extracted_slice_1 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32> | |
%23 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, tensor<1x1x32x1xi32> | |
%extracted_slice_2 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32> | |
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_2) -> (tensor<1x32x1xi32>) { | |
%25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%17, %arg6) | |
%26 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %17) | |
%27 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%26, %21, %19) | |
%28 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %17) | |
%29 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%28, %21, %19) | |
%30 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%29, %27) | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, 0, %27, 0] [1, 1, %30, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32> | |
%31 = vector.transfer_write %cst, %6[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, tensor<1x1x32x1xi32> | |
%32 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : tensor<1x1x?x1xi32>, vector<32x1xi32> | |
%33 = vector.transfer_write %32, %31[%c0, %c0, %25, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, tensor<1x1x32x1xi32> | |
%extracted_slice_5 = tensor.extract_slice %33[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32> | |
%collapsed = tensor.collapse_shape %extracted_slice_5 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32> | |
%collapsed_6 = tensor.collapse_shape %arg7 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32> | |
%34 = vector.transfer_read %collapsed[%c0, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x32xi32>, vector<32xi32> | |
%35 = vector.transfer_read %cst_0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x43x1xi32>, vector<1xi32> | |
%36 = vector.transfer_read %collapsed_6[%c0, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x32xi32>, vector<32xi32> | |
%37 = vector.shuffle %35, %35 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%38 = arith.muli %34, %37 : vector<32xi32> | |
%39 = arith.addi %38, %36 : vector<32xi32> | |
%40 = vector.transfer_write %39, %collapsed_6[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, tensor<1x32xi32> | |
%expanded = tensor.expand_shape %40 [[0], [1, 2]] : tensor<1x32xi32> into tensor<1x32x1xi32> | |
scf.yield %expanded : tensor<1x32x1xi32> | |
} | |
%inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32> | |
%inserted_slice_3 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32> | |
scf.yield %inserted_slice_3 : tensor<1x60x64x1xi32> | |
} | |
scf.yield %16 : tensor<1x60x64x1xi32> | |
} | |
flow.dispatch.tensor.store %15, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
} | |
} | |
return | |
} | |
} | |
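// The loop nest above is the vectorized body of the depthwise convolution: a | |
// 1x43 horizontal filter over a 1x1080x1920x1 i32 image, tiled 60x64 per | |
// workgroup and stepped 1 row x 32 columns in the inner loops. The | |
// affine.min/max chains clamp the input window to [0, 1920] (left pad 21, | |
// window end d0 + 85, i.e. 64 + 21 for a 43-tap 'same' convolution), and the | |
// out-of-bounds columns are realized by zero-filling a static 1x1x32x1 | |
// staging tensor with a splat vector.transfer_write and copying only the | |
// in-bounds slice into it, before the broadcast-tap (vector.shuffle) | |
// multiply-accumulate. | |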
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- // | |
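// A mechanical pre-bufferization step: each remaining `tensor.empty()` is | |
// rewritten to `bufferization.alloc_tensor()` so that the bufferization pass | |
// below materializes a real allocation for it instead of treating it as an | |
// undefined-value tensor. The only change relative to the previous dump is | |
// the definition of %6: | |
//   before: %6 = tensor.empty() : tensor<1x1x32x1xi32> | |
//   after:  %6 = bufferization.alloc_tensor() : tensor<1x1x32x1xi32> | |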
module { | |
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%cst = arith.constant dense<0> : vector<32x1xi32> | |
%c43 = arith.constant 43 : index | |
%c32 = arith.constant 32 : index | |
%c60 = arith.constant 60 : index | |
%c64 = arith.constant 64 : index | |
%c1 = arith.constant 1 : index | |
%c1920 = arith.constant 1920 : index | |
%c1080 = arith.constant 1080 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0_i32 = arith.constant 0 : i32 | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
%6 = bufferization.alloc_tensor() : tensor<1x1x32x1xi32> | |
scf.for %arg0 = %2 to %c1080 step %3 { | |
scf.for %arg1 = %4 to %c1920 step %5 { | |
%7 = flow.dispatch.tensor.load %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x64x1xi32> | |
%8 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1) | |
%9 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1) | |
%10 = affine.min affine_map<(d0) -> (1920, d0)>(%9) | |
%11 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1) | |
%12 = affine.min affine_map<(d0) -> (1920, d0)>(%11) | |
%13 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%12, %10) | |
%14 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %10, 0], sizes = [1, 60, %13, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1080x1920x1xi32>> -> tensor<1x60x?x1xi32> | |
%15 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %7) -> (tensor<1x60x64x1xi32>) { | |
%16 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (tensor<1x60x64x1xi32>) { | |
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%8, %arg4) | |
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %8) | |
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %12, %10) | |
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %8) | |
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %12, %10) | |
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19) | |
%extracted_slice = tensor.extract_slice %14[0, %arg2, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : tensor<1x60x?x1xi32> to tensor<1x1x?x1xi32> | |
%extracted_slice_1 = tensor.extract_slice %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x60x64x1xi32> to tensor<1x1x32x1xi32> | |
%23 = vector.transfer_write %cst, %extracted_slice_1[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, tensor<1x1x32x1xi32> | |
%extracted_slice_2 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32> | |
%24 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %extracted_slice_2) -> (tensor<1x32x1xi32>) { | |
%25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%17, %arg6) | |
%26 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %17) | |
%27 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%26, %21, %19) | |
%28 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %17) | |
%29 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%28, %21, %19) | |
%30 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%29, %27) | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, 0, %27, 0] [1, 1, %30, 1] [1, 1, 1, 1] : tensor<1x1x?x1xi32> to tensor<1x1x?x1xi32> | |
%31 = vector.transfer_write %cst, %6[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, tensor<1x1x32x1xi32> | |
%32 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : tensor<1x1x?x1xi32>, vector<32x1xi32> | |
%33 = vector.transfer_write %32, %31[%c0, %c0, %25, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, tensor<1x1x32x1xi32> | |
%extracted_slice_5 = tensor.extract_slice %33[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> to tensor<1x32x1xi32> | |
%collapsed = tensor.collapse_shape %extracted_slice_5 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32> | |
%collapsed_6 = tensor.collapse_shape %arg7 [[0], [1, 2]] : tensor<1x32x1xi32> into tensor<1x32xi32> | |
%34 = vector.transfer_read %collapsed[%c0, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x32xi32>, vector<32xi32> | |
%35 = vector.transfer_read %cst_0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x43x1xi32>, vector<1xi32> | |
%36 = vector.transfer_read %collapsed_6[%c0, %c0], %c0_i32 {in_bounds = [true]} : tensor<1x32xi32>, vector<32xi32> | |
%37 = vector.shuffle %35, %35 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%38 = arith.muli %34, %37 : vector<32xi32> | |
%39 = arith.addi %38, %36 : vector<32xi32> | |
%40 = vector.transfer_write %39, %collapsed_6[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, tensor<1x32xi32> | |
%expanded = tensor.expand_shape %40 [[0], [1, 2]] : tensor<1x32xi32> into tensor<1x32x1xi32> | |
scf.yield %expanded : tensor<1x32x1xi32> | |
} | |
%inserted_slice = tensor.insert_slice %24 into %23[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x32x1xi32> into tensor<1x1x32x1xi32> | |
%inserted_slice_3 = tensor.insert_slice %inserted_slice into %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : tensor<1x1x32x1xi32> into tensor<1x60x64x1xi32> | |
scf.yield %inserted_slice_3 : tensor<1x60x64x1xi32> | |
} | |
scf.yield %16 : tensor<1x60x64x1xi32> | |
} | |
flow.dispatch.tensor.store %15, %1, offsets = [0, %arg0, %arg1, 0], sizes = [1, 60, 64, 1], strides = [1, 1, 1, 1] : tensor<1x60x64x1xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x1080x1920x1xi32>> | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After IREEComprehensiveBufferize (iree-codegen-iree-comprehensive-bufferize) //----- // | |
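// Bufferization replaces all tensor SSA values with memrefs: the two | |
// `flow.dispatch.tensor` bindings become storage-buffer memrefs (with | |
// `memref.assume_alignment` recording the 64-byte guarantee), the staging | |
// `bufferization.alloc_tensor` becomes a 64-byte-aligned `memref.alloca`, the | |
// constant filter is accessed through `bufferization.to_memref`, and the | |
// dispatch tensor load/store and extract/insert_slice ops all become | |
// `memref.subview`s. Where the analysis cannot yet prove a copy redundant it | |
// inserts conservative element-wise copy `linalg.generic`s and threads | |
// memrefs through the scf.for iter_args; the canonicalize/CSE passes below | |
// clean both up. | |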
module { | |
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c0_i32 = arith.constant 0 : i32 | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0 = arith.constant 0 : index | |
%c1080 = arith.constant 1080 : index | |
%c1920 = arith.constant 1920 : index | |
%c1 = arith.constant 1 : index | |
%c64 = arith.constant 64 : index | |
%c60 = arith.constant 60 : index | |
%c32 = arith.constant 32 : index | |
%c43 = arith.constant 43 : index | |
%cst_0 = arith.constant dense<0> : vector<32x1xi32> | |
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32> | |
%0 = bufferization.to_memref %cst : memref<1x43x1xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c1080 step %4 { | |
scf.for %arg1 = %5 to %c1920 step %6 { | |
%subview = memref.subview %2[0, %arg0, %arg1, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1) | |
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1) | |
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8) | |
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1) | |
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10) | |
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9) | |
%subview_1 = memref.subview %1[0, %arg0, %9, 0] [1, 60, %12, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%13 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %subview) -> (memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
%14 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
%15 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4) | |
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7) | |
%17 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%16, %11, %9) | |
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7) | |
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %11, %9) | |
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%19, %17) | |
%subview_3 = memref.subview %subview_1[0, %arg2, %17, 0] [1, 1, %20, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_4 = memref.subview %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%21 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %subview_5) -> (memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
%22 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%15, %arg6) | |
%23 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %15) | |
%24 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%23, %19, %17) | |
%25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %15) | |
%26 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%25, %19, %17) | |
%27 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%26, %24) | |
%subview_8 = memref.subview %subview_3[0, 0, %24, 0] [1, 1, %27, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%28 = vector.transfer_read %subview_8[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32x1xi32> | |
vector.transfer_write %28, %alloca[%c0, %c0, %22, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%subview_9 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>> | |
%collapse_shape = memref.collapse_shape %subview_9 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>> | |
%collapse_shape_10 = memref.collapse_shape %arg7 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%29 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32> | |
%30 = vector.transfer_read %0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32> | |
%31 = vector.transfer_read %collapse_shape_10[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32xi32> | |
%32 = vector.shuffle %30, %30 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%33 = arith.muli %29, %32 : vector<32xi32> | |
%34 = arith.addi %33, %31 : vector<32xi32> | |
vector.transfer_write %34, %collapse_shape_10[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
scf.yield %arg7 : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
%subview_6 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%21 : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_6 : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} | |
%subview_7 = memref.subview %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_4 : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_7 : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} | |
scf.yield %arg5 : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
scf.yield %14 : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
%subview_2 = memref.subview %2[0, %arg0, %arg1, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_2 : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- // | |
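// No tensor.dim/memref.dim ops remain to resolve, so this pass is a no-op | |
// here: the dump below is identical to the previous one. | |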
module { | |
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c0_i32 = arith.constant 0 : i32 | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0 = arith.constant 0 : index | |
%c1080 = arith.constant 1080 : index | |
%c1920 = arith.constant 1920 : index | |
%c1 = arith.constant 1 : index | |
%c64 = arith.constant 64 : index | |
%c60 = arith.constant 60 : index | |
%c32 = arith.constant 32 : index | |
%c43 = arith.constant 43 : index | |
%cst_0 = arith.constant dense<0> : vector<32x1xi32> | |
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32> | |
%0 = bufferization.to_memref %cst : memref<1x43x1xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c1080 step %4 { | |
scf.for %arg1 = %5 to %c1920 step %6 { | |
%subview = memref.subview %2[0, %arg0, %arg1, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1) | |
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1) | |
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8) | |
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1) | |
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10) | |
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9) | |
%subview_1 = memref.subview %1[0, %arg0, %9, 0] [1, 60, %12, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%13 = scf.for %arg2 = %c0 to %c60 step %c1 iter_args(%arg3 = %subview) -> (memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
%14 = scf.for %arg4 = %c0 to %c64 step %c32 iter_args(%arg5 = %arg3) -> (memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
%15 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg4) | |
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %7) | |
%17 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%16, %11, %9) | |
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg4, %7) | |
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %11, %9) | |
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%19, %17) | |
%subview_3 = memref.subview %subview_1[0, %arg2, %17, 0] [1, 1, %20, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_4 = memref.subview %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%21 = scf.for %arg6 = %c0 to %c43 step %c1 iter_args(%arg7 = %subview_5) -> (memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
%22 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%15, %arg6) | |
%23 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg6, %15) | |
%24 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%23, %19, %17) | |
%25 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg6, %15) | |
%26 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%25, %19, %17) | |
%27 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%26, %24) | |
%subview_8 = memref.subview %subview_3[0, 0, %24, 0] [1, 1, %27, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%28 = vector.transfer_read %subview_8[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32x1xi32> | |
vector.transfer_write %28, %alloca[%c0, %c0, %22, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%subview_9 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>> | |
%collapse_shape = memref.collapse_shape %subview_9 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>> | |
%collapse_shape_10 = memref.collapse_shape %arg7 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%29 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32> | |
%30 = vector.transfer_read %0[%c0, %arg6, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32> | |
%31 = vector.transfer_read %collapse_shape_10[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32xi32> | |
%32 = vector.shuffle %30, %30 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%33 = arith.muli %29, %32 : vector<32xi32> | |
%34 = arith.addi %33, %31 : vector<32xi32> | |
vector.transfer_write %34, %collapse_shape_10[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
scf.yield %arg7 : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
%subview_6 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%21 : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_6 : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} | |
%subview_7 = memref.subview %arg5[0, %arg2, %arg4, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_4 : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_7 : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} | |
scf.yield %arg5 : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
scf.yield %14 : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
%subview_2 = memref.subview %2[0, %arg0, %arg1, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_2 : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
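// Canonicalization folds away the scf.for iter_args that were only threading | |
// the same memref values through unchanged, so the tiling loops no longer | |
// produce results, and the conservative copy `linalg.generic`s now read and | |
// write through structurally identical subviews of the same buffer. That | |
// sets up the CSE pass below to merge those subviews and expose the copies | |
// as no-ops. | |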
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c0_i32 = arith.constant 0 : i32 | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0 = arith.constant 0 : index | |
%c1080 = arith.constant 1080 : index | |
%c1920 = arith.constant 1920 : index | |
%c1 = arith.constant 1 : index | |
%c64 = arith.constant 64 : index | |
%c60 = arith.constant 60 : index | |
%c32 = arith.constant 32 : index | |
%c43 = arith.constant 43 : index | |
%cst_0 = arith.constant dense<0> : vector<32x1xi32> | |
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32> | |
%0 = bufferization.to_memref %cst : memref<1x43x1xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c1080 step %4 { | |
scf.for %arg1 = %5 to %c1920 step %6 { | |
%subview = memref.subview %2[0, %arg0, %arg1, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1) | |
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1) | |
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8) | |
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1) | |
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10) | |
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9) | |
%subview_1 = memref.subview %1[0, %arg0, %9, 0] [1, 60, %12, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
scf.for %arg2 = %c0 to %c60 step %c1 { | |
scf.for %arg3 = %c0 to %c64 step %c32 { | |
%13 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg3) | |
%14 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg3, %7) | |
%15 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%14, %11, %9) | |
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg3, %7) | |
%17 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%16, %11, %9) | |
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%17, %15) | |
%subview_3 = memref.subview %subview_1[0, %arg2, %15, 0] [1, 1, %18, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_4 = memref.subview %subview[0, %arg2, %arg3, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
scf.for %arg4 = %c0 to %c43 step %c1 { | |
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%13, %arg4) | |
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %13) | |
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %17, %15) | |
%22 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg4, %13) | |
%23 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%22, %17, %15) | |
%24 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%23, %21) | |
%subview_8 = memref.subview %subview_3[0, 0, %21, 0] [1, 1, %24, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%25 = vector.transfer_read %subview_8[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32x1xi32> | |
vector.transfer_write %25, %alloca[%c0, %c0, %19, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%subview_9 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>> | |
%collapse_shape = memref.collapse_shape %subview_9 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>> | |
%collapse_shape_10 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%26 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32> | |
%27 = vector.transfer_read %0[%c0, %arg4, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32> | |
%28 = vector.transfer_read %collapse_shape_10[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32xi32> | |
%29 = vector.shuffle %27, %27 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%30 = arith.muli %26, %29 : vector<32xi32> | |
%31 = arith.addi %30, %28 : vector<32xi32> | |
vector.transfer_write %31, %collapse_shape_10[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
%subview_6 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%subview_5 : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_6 : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} | |
%subview_7 = memref.subview %subview[0, %arg2, %arg3, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_4 : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_7 : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} | |
} | |
} | |
%subview_2 = memref.subview %2[0, %arg0, %arg1, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_2 : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
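// CSE merges the duplicated memref.subview computations, so each | |
// conservative copy `linalg.generic` now has ins(...) equal to outs(...) | |
// (e.g. ins(%subview) outs(%subview)): a self-copy that the next | |
// canonicalization run deletes. | |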
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c0_i32 = arith.constant 0 : i32 | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0 = arith.constant 0 : index | |
%c1080 = arith.constant 1080 : index | |
%c1920 = arith.constant 1920 : index | |
%c1 = arith.constant 1 : index | |
%c64 = arith.constant 64 : index | |
%c60 = arith.constant 60 : index | |
%c32 = arith.constant 32 : index | |
%c43 = arith.constant 43 : index | |
%cst_0 = arith.constant dense<0> : vector<32x1xi32> | |
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32> | |
%0 = bufferization.to_memref %cst : memref<1x43x1xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c1080 step %4 { | |
scf.for %arg1 = %5 to %c1920 step %6 { | |
%subview = memref.subview %2[0, %arg0, %arg1, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1) | |
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1) | |
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8) | |
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1) | |
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10) | |
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9) | |
%subview_1 = memref.subview %1[0, %arg0, %9, 0] [1, 60, %12, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
scf.for %arg2 = %c0 to %c60 step %c1 { | |
scf.for %arg3 = %c0 to %c64 step %c32 { | |
%13 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg3) | |
%14 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg3, %7) | |
%15 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%14, %11, %9) | |
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg3, %7) | |
%17 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%16, %11, %9) | |
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%17, %15) | |
%subview_2 = memref.subview %subview_1[0, %arg2, %15, 0] [1, 1, %18, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_3 = memref.subview %subview[0, %arg2, %arg3, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
vector.transfer_write %cst_0, %subview_3[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
scf.for %arg4 = %c0 to %c43 step %c1 { | |
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%13, %arg4) | |
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %13) | |
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %17, %15) | |
%22 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg4, %13) | |
%23 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%22, %17, %15) | |
%24 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%23, %21) | |
%subview_5 = memref.subview %subview_2[0, 0, %21, 0] [1, 1, %24, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%25 = vector.transfer_read %subview_5[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32x1xi32> | |
vector.transfer_write %25, %alloca[%c0, %c0, %19, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%subview_6 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>> | |
%collapse_shape = memref.collapse_shape %subview_6 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>> | |
%collapse_shape_7 = memref.collapse_shape %subview_4 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%26 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32> | |
%27 = vector.transfer_read %0[%c0, %arg4, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32> | |
%28 = vector.transfer_read %collapse_shape_7[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32xi32> | |
%29 = vector.shuffle %27, %27 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%30 = arith.muli %26, %29 : vector<32xi32> | |
%31 = arith.addi %30, %28 : vector<32xi32> | |
vector.transfer_write %31, %collapse_shape_7[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%subview_4 : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_4 : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_3 : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_3 : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} | |
} | |
} | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
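// With identical ins and outs after CSE, the self-copy `linalg.generic`s are | |
// recognized as no-ops and erased; only the compute loop nest survives, | |
// accumulating the convolution in place through %collapse_shape_7. | |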
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c0_i32 = arith.constant 0 : i32 | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0 = arith.constant 0 : index | |
%c1080 = arith.constant 1080 : index | |
%c1920 = arith.constant 1920 : index | |
%c1 = arith.constant 1 : index | |
%c64 = arith.constant 64 : index | |
%c60 = arith.constant 60 : index | |
%c32 = arith.constant 32 : index | |
%c43 = arith.constant 43 : index | |
%cst_0 = arith.constant dense<0> : vector<32x1xi32> | |
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32> | |
%0 = bufferization.to_memref %cst : memref<1x43x1xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c1080 step %4 { | |
scf.for %arg1 = %5 to %c1920 step %6 { | |
%subview = memref.subview %2[0, %arg0, %arg1, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1) | |
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1) | |
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8) | |
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1) | |
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10) | |
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9) | |
%subview_1 = memref.subview %1[0, %arg0, %9, 0] [1, 60, %12, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
scf.for %arg2 = %c0 to %c60 step %c1 { | |
scf.for %arg3 = %c0 to %c64 step %c32 { | |
%13 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg3) | |
%14 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg3, %7) | |
%15 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%14, %11, %9) | |
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg3, %7) | |
%17 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%16, %11, %9) | |
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%17, %15) | |
%subview_2 = memref.subview %subview_1[0, %arg2, %15, 0] [1, 1, %18, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_3 = memref.subview %subview[0, %arg2, %arg3, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
vector.transfer_write %cst_0, %subview_3[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
scf.for %arg4 = %c0 to %c43 step %c1 { | |
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%13, %arg4) | |
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %13) | |
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %17, %15) | |
%22 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg4, %13) | |
%23 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%22, %17, %15) | |
%24 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%23, %21) | |
%subview_5 = memref.subview %subview_2[0, 0, %21, 0] [1, 1, %24, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%25 = vector.transfer_read %subview_5[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32x1xi32> | |
vector.transfer_write %25, %alloca[%c0, %c0, %19, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%subview_6 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>> | |
%collapse_shape = memref.collapse_shape %subview_6 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>> | |
%collapse_shape_7 = memref.collapse_shape %subview_4 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%26 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32> | |
%27 = vector.transfer_read %0[%c0, %arg4, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32> | |
%28 = vector.transfer_read %collapse_shape_7[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32xi32> | |
%29 = vector.shuffle %27, %27 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%30 = arith.muli %26, %29 : vector<32xi32> | |
%31 = arith.addi %30, %28 : vector<32xi32> | |
vector.transfer_write %31, %collapse_shape_7[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After CleanupBufferAllocView (iree-codegen-cleanup-buffer-alloc-view) //----- // | |
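// This pass folds reshape/subview chains around buffer allocations and drops | |
// dead allocations; in the portion of this dump visible before the file is | |
// truncated, nothing changes relative to the previous canonicalized form. | |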
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c0_i32 = arith.constant 0 : i32 | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0 = arith.constant 0 : index | |
%c1080 = arith.constant 1080 : index | |
%c1920 = arith.constant 1920 : index | |
%c1 = arith.constant 1 : index | |
%c64 = arith.constant 64 : index | |
%c60 = arith.constant 60 : index | |
%c32 = arith.constant 32 : index | |
%c43 = arith.constant 43 : index | |
%cst_0 = arith.constant dense<0> : vector<32x1xi32> | |
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32> | |
%0 = bufferization.to_memref %cst : memref<1x43x1xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c1080 step %4 { | |
scf.for %arg1 = %5 to %c1920 step %6 { | |
%subview = memref.subview %2[0, %arg0, %arg1, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1) | |
%8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1) | |
%9 = affine.min affine_map<(d0) -> (1920, d0)>(%8) | |
%10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1) | |
%11 = affine.min affine_map<(d0) -> (1920, d0)>(%10) | |
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9) | |
%subview_1 = memref.subview %1[0, %arg0, %9, 0] [1, 60, %12, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
scf.for %arg2 = %c0 to %c60 step %c1 { | |
scf.for %arg3 = %c0 to %c64 step %c32 { | |
%13 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg3) | |
%14 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg3, %7) | |
%15 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%14, %11, %9) | |
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg3, %7) | |
%17 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%16, %11, %9) | |
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%17, %15) | |
%subview_2 = memref.subview %subview_1[0, %arg2, %15, 0] [1, 1, %18, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_3 = memref.subview %subview[0, %arg2, %arg3, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
vector.transfer_write %cst_0, %subview_3[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
scf.for %arg4 = %c0 to %c43 step %c1 { | |
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%13, %arg4) | |
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %13) | |
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %17, %15) | |
%22 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg4, %13) | |
%23 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%22, %17, %15) | |
%24 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%23, %21) | |
%subview_5 = memref.subview %subview_2[0, 0, %21, 0] [1, 1, %24, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%25 = vector.transfer_read %subview_5[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32x1xi32> | |
vector.transfer_write %25, %alloca[%c0, %c0, %19, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%subview_6 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>> | |
%collapse_shape = memref.collapse_shape %subview_6 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>> | |
%collapse_shape_7 = memref.collapse_shape %subview_4 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%26 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32> | |
%27 = vector.transfer_read %0[%c0, %arg4, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32> | |
%28 = vector.transfer_read %collapse_shape_7[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<32xi32> | |
%29 = vector.shuffle %27, %27 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%30 = arith.muli %26, %29 : vector<32xi32> | |
%31 = arith.addi %30, %28 : vector<32xi32> | |
vector.transfer_write %31, %collapse_shape_7[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
} | |
} | |
} | |
} | |
return | |
} | |
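// NOTE: judging by the loop nest, which matches the tail of the preceding
// dump, this cleanup found little to fold here. The function stays as
// bufferization produced it: both pipeline bindings are raw memrefs from
// hal.interface.binding.subspan with a 64-byte alignment assumption, and the
// constant filter is read through bufferization.to_memref. A minimal sketch
// of that constant pattern (the @filter_constant wrapper and its shortened
// two-tap filter are illustrative only):
func.func @filter_constant() -> i32 {
  %c0 = arith.constant 0 : index
  %cst = arith.constant dense<[[[157], [206]]]> : tensor<1x2x1xi32>
  // View the dense constant as a (read-only) buffer.
  %m = bufferization.to_memref %cst : memref<1x2x1xi32>
  %tap = memref.load %m[%c0, %c0, %c0] : memref<1x2x1xi32>
  return %tap : i32
}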
// -----// IR Dump After EraseHALDescriptorTypeFromMemRef (iree-codegen-erase-hal-descriptor-type-from-memref) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
  %c0_i32 = arith.constant 0 : i32
  %cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
  %c0 = arith.constant 0 : index
  %c1080 = arith.constant 1080 : index
  %c1920 = arith.constant 1920 : index
  %c1 = arith.constant 1 : index
  %c64 = arith.constant 64 : index
  %c60 = arith.constant 60 : index
  %c32 = arith.constant 32 : index
  %c43 = arith.constant 43 : index
  %cst_0 = arith.constant dense<0> : vector<32x1xi32>
  %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
  %0 = bufferization.to_memref %cst : memref<1x43x1xi32>
  %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
  memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
  %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
  memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_count_y]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
  %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
  scf.for %arg0 = %3 to %c1080 step %4 {
    scf.for %arg1 = %5 to %c1920 step %6 {
      %subview = memref.subview %2[0, %arg0, %arg1, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
      %7 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%arg1)
      %8 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%arg1)
      %9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
      %10 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%arg1)
      %11 = affine.min affine_map<(d0) -> (1920, d0)>(%10)
      %12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %9)
      %subview_1 = memref.subview %1[0, %arg0, %9, 0] [1, 60, %12, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
      scf.for %arg2 = %c0 to %c60 step %c1 {
        scf.for %arg3 = %c0 to %c64 step %c32 {
          %13 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%7, %arg3)
          %14 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg3, %7)
          %15 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%14, %11, %9)
          %16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg3, %7)
          %17 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%16, %11, %9)
          %18 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%17, %15)
          %subview_2 = memref.subview %subview_1[0, %arg2, %15, 0] [1, 1, %18, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
          %subview_3 = memref.subview %subview[0, %arg2, %arg3, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
          vector.transfer_write %cst_0, %subview_3[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
          %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
          scf.for %arg4 = %c0 to %c43 step %c1 {
            %19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%13, %arg4)
            %20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg4, %13)
            %21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %17, %15)
            %22 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg4, %13)
            %23 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%22, %17, %15)
            %24 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%23, %21)
            %subview_5 = memref.subview %subview_2[0, 0, %21, 0] [1, 1, %24, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
            vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
            %25 = vector.transfer_read %subview_5[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
            vector.transfer_write %25, %alloca[%c0, %c0, %19, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
            %subview_6 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
            %collapse_shape = memref.collapse_shape %subview_6 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
            %collapse_shape_7 = memref.collapse_shape %subview_4 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
            %26 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
            %27 = vector.transfer_read %0[%c0, %arg4, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32>
            %28 = vector.transfer_read %collapse_shape_7[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
            %29 = vector.shuffle %27, %27 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
            %30 = arith.muli %26, %29 : vector<32xi32>
            %31 = arith.addi %30, %28 : vector<32xi32>
            vector.transfer_write %31, %collapse_shape_7[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>>
          }
        }
      }
    }
  }
  return
}
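// NOTE: the only change in this dump is in the types: the
// #hal.descriptor_type<storage_buffer> memory-space attribute has been erased
// from every memref, e.g.
//   before: memref<1x1080x1920x1xi32, #hal.descriptor_type<storage_buffer>>
//   after:  memref<1x1080x1920x1xi32>
// so downstream lowering sees plain, default-memory-space memrefs.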
// -----// IR Dump After RemoveSingleIterationLoop (iree-codegen-remove-single-iteration-loop) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
  %c0_i32 = arith.constant 0 : i32
  %cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c64 = arith.constant 64 : index
  %c60 = arith.constant 60 : index
  %c32 = arith.constant 32 : index
  %c43 = arith.constant 43 : index
  %cst_0 = arith.constant dense<0> : vector<32x1xi32>
  %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
  %0 = bufferization.to_memref %cst : memref<1x43x1xi32>
  %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
  memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
  %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
  memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
  %subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
  %5 = affine.max affine_map<(d0) -> (-d0 + 21, 0)>(%4)
  %6 = affine.max affine_map<(d0) -> (0, d0 - 21)>(%4)
  %7 = affine.min affine_map<(d0) -> (1920, d0)>(%6)
  %8 = affine.max affine_map<(d0) -> (0, d0 + 85)>(%4)
  %9 = affine.min affine_map<(d0) -> (1920, d0)>(%8)
  %10 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%9, %7)
  %subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
  scf.for %arg0 = %c0 to %c60 step %c1 {
    scf.for %arg1 = %c0 to %c64 step %c32 {
      %11 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%5, %arg1)
      %12 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg1, %5)
      %13 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%12, %9, %7)
      %14 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 74)>(%arg1, %5)
      %15 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%14, %9, %7)
      %16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
      %subview_2 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
      %subview_3 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
      vector.transfer_write %cst_0, %subview_3[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
      %subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
      scf.for %arg2 = %c0 to %c43 step %c1 {
        %17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2)
        %18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11)
        %19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13)
        %20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11)
        %21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13)
        %22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
        %subview_5 = memref.subview %subview_2[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
        vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
        %23 = vector.transfer_read %subview_5[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
        vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
        %subview_6 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
        %collapse_shape = memref.collapse_shape %subview_6 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
        %collapse_shape_7 = memref.collapse_shape %subview_4 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
        %24 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
        %25 = vector.transfer_read %0[%c0, %arg2, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32>
        %26 = vector.transfer_read %collapse_shape_7[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
        %27 = vector.shuffle %25, %25 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
        %28 = arith.muli %24, %27 : vector<32xi32>
        %29 = arith.addi %28, %26 : vector<32xi32>
        vector.transfer_write %29, %collapse_shape_7[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>>
      }
    }
  }
  return
}
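// NOTE: the two workgroup-distribution loops are gone. Each workgroup now
// handles exactly one 60x64 tile (the tile offsets %3 and %4 come straight
// from the workgroup ids), presumably because the launch grid covers the
// 1080x1920 iteration space exactly, making each distributed loop a single
// iteration. A minimal sketch of the rewrite this pass performs (the
// @fold_single_iteration wrapper is illustrative only):
func.func @fold_single_iteration(%buf: memref<64xi32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %v = arith.constant 7 : i32
  // The bounds prove exactly one iteration, so this loop ...
  scf.for %i = %c0 to %c1 step %c1 {
    memref.store %v, %buf[%i] : memref<64xi32>
  }
  // ... can be replaced by its body with %i substituted by the lower bound:
  memref.store %v, %buf[%c0] : memref<64xi32>
  return
}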
// -----// IR Dump After LinalgStrategyLowerVectorsPass (iree-linalg-strategy-lower-vectors-pass) //----- //
// (function body unchanged: verbatim duplicate of the preceding dump, elided)
// -----// IR Dump After CSE (cse) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
  %c0_i32 = arith.constant 0 : i32
  %cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c64 = arith.constant 64 : index
  %c60 = arith.constant 60 : index
  %c32 = arith.constant 32 : index
  %c43 = arith.constant 43 : index
  %cst_0 = arith.constant dense<0> : vector<32x1xi32>
  %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
  %0 = bufferization.to_memref %cst : memref<1x43x1xi32>
  %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
  memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
  %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
  memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
  %subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
  %5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
  %6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
  %7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6]
  %8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
  %9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8]
  %10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7]
  %subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
  %subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
  %collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
  scf.for %arg0 = %c0 to %c60 step %c1 {
    scf.for %arg1 = %c0 to %c64 step %c32 {
      %11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5]
      %12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5]
      %13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7]
      %14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5]
      %15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7]
      %16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
      %subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
      %subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
      vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
      %subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
      %collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
      scf.for %arg2 = %c0 to %c43 step %c1 {
        %17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2)
        %18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11)
        %19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13)
        %20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11)
        %21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13)
        %22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
        %subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
        vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
        %23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
        vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
        %24 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
        %25 = vector.transfer_read %0[%c0, %arg2, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32>
        %26 = vector.transfer_read %collapse_shape_6[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
        %27 = vector.shuffle %25, %25 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
        %28 = arith.muli %24, %27 : vector<32xi32>
        %29 = arith.addi %28, %26 : vector<32xi32>
        vector.transfer_write %29, %collapse_shape_6[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>>
      }
    }
  }
  return
}
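// NOTE: relative to the preceding dump, the duplicated affine index
// computations have been folded together into symbol-based maps evaluated
// once, and the loop-invariant %subview_2 / %collapse_shape views of %alloca
// now sit ahead of the loop nest. A minimal sketch of what CSE does (the
// @dedup wrapper is illustrative only):
func.func @dedup(%a: index) -> index {
  // Two syntactically identical, side-effect-free computations ...
  %0 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%a]
  %1 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%a]
  // ... CSE rewrites every use of %1 to %0, leaving a single affine.apply.
  %sum = arith.addi %0, %1 : index
  return %sum : index
}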
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- //
// (function body unchanged: verbatim duplicate of the preceding dump, elided)
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (iree-linalg-strategy-remove-markers-pass) //----- //
// (function body unchanged: verbatim duplicate of the preceding dump, elided)
// -----// IR Dump After CSE (cse) //----- //
// (function body unchanged: verbatim duplicate of the preceding dump, elided)
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- // | |
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c0_i32 = arith.constant 0 : i32 | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c64 = arith.constant 64 : index | |
%c60 = arith.constant 60 : index | |
%c32 = arith.constant 32 : index | |
%c43 = arith.constant 43 : index | |
%cst_0 = arith.constant dense<0> : vector<32x1xi32> | |
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32> | |
%0 = bufferization.to_memref %cst : memref<1x43x1xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x] | |
%6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x] | |
%7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6] | |
%8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x] | |
%9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8] | |
%10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7] | |
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>> | |
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>> | |
scf.for %arg0 = %c0 to %c60 step %c1 { | |
scf.for %arg1 = %c0 to %c64 step %c32 { | |
%11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5] | |
%12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5] | |
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7] | |
%14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5] | |
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7] | |
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13) | |
%subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> | |
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>> | |
scf.for %arg2 = %c0 to %c43 step %c1 { | |
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2) | |
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11) | |
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13) | |
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11) | |
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13) | |
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19) | |
%subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32> | |
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%24 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32> | |
%25 = vector.transfer_read %0[%c0, %arg2, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32> | |
%26 = vector.transfer_read %collapse_shape_6[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
%27 = vector.shuffle %25, %25 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%28 = arith.muli %24, %27 : vector<32xi32> | |
%29 = arith.addi %28, %26 : vector<32xi32> | |
vector.transfer_write %29, %collapse_shape_6[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>> | |
} | |
} | |
} | |
return | |
} | |
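// NOTE: the scf.for nest above implements, per 32-wide tile, a 1-D depthwise
// convolution of one image row with the 43-tap filter in %cst, using zero
// padding at the row borders. A minimal scalar C sketch of the same
// computation; the names (conv1d_tile32, in, out, x0, width) are illustrative
// and not taken from the IR, and i32 overflow is ignored:
//
//   #include <stdint.h>
//
//   static void conv1d_tile32(const int32_t *in, const int32_t *filter,
//                             int32_t *out, int x0 /* tile start */,
//                             int width /* row width, 1920 here */) {
//     for (int x = x0; x < x0 + 32; ++x) {
//       int32_t acc = 0;                                       // output zero-init
//       for (int k = 0; k < 43; ++k) {
//         int src = x + k - 21;                                // centered window
//         int32_t v = (src < 0 || src >= width) ? 0 : in[src]; // zero padding
//         acc += v * filter[k];                                // arith.muli/addi
//       }
//       out[x] = acc;
//     }
//   }
//
// Per filter tap %arg2, the IR broadcasts filter[%arg2] across 32 lanes
// (vector.shuffle of the vector<1xi32> read), multiplies it with the shifted
// input window staged in %alloca (arith.muli), and accumulates into the
// zero-initialized output tile (arith.addi), i.e. the k-loop above with the
// x-loop executed as a 32-lane vector.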
// -----// IR Dump After LinalgVectorLowering (linalg-vector-lowering) //----- // | |
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c0_i32 = arith.constant 0 : i32 | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c64 = arith.constant 64 : index | |
%c60 = arith.constant 60 : index | |
%c32 = arith.constant 32 : index | |
%c43 = arith.constant 43 : index | |
%cst_0 = arith.constant dense<0> : vector<32x1xi32> | |
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32> | |
%0 = bufferization.to_memref %cst : memref<1x43x1xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x] | |
%6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x] | |
%7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6] | |
%8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x] | |
%9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8] | |
%10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7] | |
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>> | |
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>> | |
scf.for %arg0 = %c0 to %c60 step %c1 { | |
scf.for %arg1 = %c0 to %c64 step %c32 { | |
%11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5] | |
%12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5] | |
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7] | |
%14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5] | |
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7] | |
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13) | |
%subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> | |
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>> | |
scf.for %arg2 = %c0 to %c43 step %c1 { | |
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2) | |
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11) | |
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13) | |
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11) | |
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13) | |
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19) | |
%subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32> | |
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%24 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32> | |
%25 = vector.transfer_read %0[%c0, %arg2, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32> | |
%26 = vector.transfer_read %collapse_shape_6[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
%27 = vector.shuffle %25, %25 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%28 = arith.muli %24, %27 : vector<32xi32> | |
%29 = arith.addi %28, %26 : vector<32xi32> | |
vector.transfer_write %29, %collapse_shape_6[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>> | |
} | |
} | |
} | |
return | |
} | |
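// NOTE: %5 through %10 above are the pad_fusion window bounds for this
// workgroup's 64-wide tile: how much zero padding the left edge needs, and
// which in-bounds slice of the 1920-pixel row must actually be read. A C
// sketch of the same arithmetic; tile_window, pad_left, win_lo, win_hi are
// illustrative names, not from the IR:
//
//   static void tile_window(int wg_x, int *pad_left, int *win_lo, int *win_hi) {
//     int x0 = wg_x * 64;                       // tile start (%4)
//     *pad_left = x0 < 21 ? 21 - x0 : 0;        // %5 = max(-x0 + 21, 0)
//     int lo    = x0 - 21 > 0 ? x0 - 21 : 0;    // %6 = max(0, x0 - 21)
//     *win_lo   = lo < 1920 ? lo : 1920;        // %7 = min(1920, %6)
//     int hi    = x0 + 85 > 0 ? x0 + 85 : 0;    // %8 = max(0, x0 + 85)
//     *win_hi   = hi < 1920 ? hi : 1920;        // %9 = min(1920, %8)
//     // %10 = *win_hi - *win_lo: dynamic width of the input subview
//   }
//
// The 106-element read window (64 outputs + 43 taps - 1) sits 21 pixels to the
// left of the tile start, so tiles at either image border get a clamped,
// narrower input slice and the missing pixels are supplied as zeros via the
// %alloca staging buffer.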
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// (function body identical to the dump after LinalgVectorLowering above; this pass left the IR unchanged, duplicate elided)
// -----// IR Dump After CSE (cse) //----- // | |
// (function body identical to the dump after LinalgVectorLowering above; this pass left the IR unchanged, duplicate elided)
// -----// IR Dump After LinalgStrategyLowerVectorsPass (iree-linalg-strategy-lower-vectors-pass) //----- // | |
// (function body identical to the dump after LinalgVectorLowering above; this pass left the IR unchanged, duplicate elided)
// -----// IR Dump After CSE (cse) //----- // | |
// (function body identical to the dump after LinalgVectorLowering above; this pass left the IR unchanged, duplicate elided)
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- // | |
// (function body identical to the dump after LinalgVectorLowering above; this pass left the IR unchanged, duplicate elided)
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (iree-linalg-strategy-remove-markers-pass) //----- // | |
// (function body identical to the dump after LinalgVectorLowering above; this pass left the IR unchanged, duplicate elided)
// -----// IR Dump After CSE (cse) //----- // | |
// (function body identical to the dump after LinalgVectorLowering above; this pass left the IR unchanged, duplicate elided)
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- // | |
// (function body identical to the dump after LinalgVectorLowering above; this pass left the IR unchanged, duplicate elided)
// -----// IR Dump After LinalgVectorLowering (linalg-vector-lowering) //----- // | |
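// NOTE: This is the settled form of the dispatch; the passes dumped after this
// point (canonicalize, cse, and the linalg vector-lowering strategy passes)
// leave the function unchanged, so their dumps repeat this IR verbatim.
// What the IR computes: a 43-tap (radius-21) depthwise convolution over a
// 1x1080x1920x1 i32 image. Each workgroup produces a 60x64 output tile,
// processed as two 32-wide column tiles using vector<32xi32> arithmetic.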
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c0_i32 = arith.constant 0 : i32 | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c64 = arith.constant 64 : index | |
%c60 = arith.constant 60 : index | |
%c32 = arith.constant 32 : index | |
%c43 = arith.constant 43 : index | |
%cst_0 = arith.constant dense<0> : vector<32x1xi32> | |
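// The 64-byte-aligned 1x1x32x1 stack tile below is a zero-padded staging
// buffer: each kernel-tap iteration zeroes it, copies the in-bounds slice of
// the input window into it, then reads a full 32-lane vector back out.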
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32> | |
%0 = bufferization.to_memref %cst : memref<1x43x1xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x] | |
%6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x] | |
%7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6] | |
%8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x] | |
%9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8] | |
%10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7] | |
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>> | |
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>> | |
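// Loop nest: 60 output rows per workgroup (step 1), the 64-column tile split
// into two 32-wide vector tiles (step 32), and innermost the 43 kernel taps.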
scf.for %arg0 = %c0 to %c60 step %c1 { | |
scf.for %arg1 = %c0 to %c64 step %c32 { | |
%11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5] | |
%12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5] | |
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7] | |
%14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5] | |
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7] | |
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13) | |
%subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> | |
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>> | |
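// Per-tap multiply-accumulate over the 32-wide tile. In scalar form (a
// sketch with hypothetical names; the affine.max/min below implement the
// zero-padding clamps at the image borders):
//   for (k = 0; k < 43; ++k)
//     for (x = 0; x < 32; ++x)
//       out[row][col + x] += kernel[k] * in[row][col + x + k - 21];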
scf.for %arg2 = %c0 to %c43 step %c1 { | |
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2) | |
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11) | |
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13) | |
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11) | |
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13) | |
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19) | |
%subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
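// Stage the clamped input slice: zero the staging tile, read up to 32
// elements from %subview_7 (in_bounds = [false, true], so out-of-bounds
// lanes are filled with %c0_i32), and write them into the tile at offset
// %17, the left-padding amount for this tap.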
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32> | |
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
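// The MAC itself: %27 broadcasts the k-th kernel tap across 32 lanes (a
// vector.shuffle of a 1-element vector), %28/%29 compute kernel[k] * input +
// acc, and the result is written back to the output tile, which doubles as
// the accumulator (it was zero-initialized before the tap loop).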
%24 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32> | |
%25 = vector.transfer_read %0[%c0, %arg2, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32> | |
%26 = vector.transfer_read %collapse_shape_6[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
%27 = vector.shuffle %25, %25 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%28 = arith.muli %24, %27 : vector<32xi32> | |
%29 = arith.addi %28, %26 : vector<32xi32> | |
vector.transfer_write %29, %collapse_shape_6[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// (function body unchanged by this pass; identical to the preceding dump)
// -----// IR Dump After CSE (cse) //----- // | |
// (function body unchanged by this pass; identical to the preceding dump)
// -----// IR Dump After LinalgStrategyLowerVectorsPass (iree-linalg-strategy-lower-vectors-pass) //----- // | |
// (function body unchanged by this pass; identical to the preceding dump)
// -----// IR Dump After CSE (cse) //----- // | |
// (function body unchanged by this pass; identical to the preceding dump)
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- // | |
// (function body unchanged by this pass; identical to the preceding dump)
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (iree-linalg-strategy-remove-markers-pass) //----- // | |
// (function body unchanged by this pass; identical to the preceding dump)
// -----// IR Dump After CSE (cse) //----- // | |
// (function body unchanged by this pass; identical to the preceding dump)
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- // | |
// (function body unchanged by this pass; identical to the preceding dump)
// -----// IR Dump After LinalgVectorLowering (linalg-vector-lowering) //----- // | |
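// (the function below once more repeats the canonical dump above verbatim)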
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c0_i32 = arith.constant 0 : i32 | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c64 = arith.constant 64 : index | |
%c60 = arith.constant 60 : index | |
%c32 = arith.constant 32 : index | |
%c43 = arith.constant 43 : index | |
%cst_0 = arith.constant dense<0> : vector<32x1xi32> | |
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32> | |
%0 = bufferization.to_memref %cst : memref<1x43x1xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x] | |
%6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x] | |
%7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6] | |
%8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x] | |
%9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8] | |
%10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7] | |
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>> | |
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>> | |
scf.for %arg0 = %c0 to %c60 step %c1 { | |
scf.for %arg1 = %c0 to %c64 step %c32 { | |
%11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5] | |
%12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5] | |
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7] | |
%14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5] | |
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7] | |
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13) | |
%subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> | |
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>> | |
scf.for %arg2 = %c0 to %c43 step %c1 { | |
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2) | |
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11) | |
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13) | |
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11) | |
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13) | |
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19) | |
%subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32> | |
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%24 = vector.transfer_read %collapse_shape[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32> | |
%25 = vector.transfer_read %0[%c0, %arg2, %c0], %c0_i32 {in_bounds = [true]} : memref<1x43x1xi32>, vector<1xi32> | |
%26 = vector.transfer_read %collapse_shape_6[%c0, %c0], %c0_i32 {in_bounds = [true]} : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
%27 = vector.shuffle %25, %25 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%28 = arith.muli %24, %27 : vector<32xi32> | |
%29 = arith.addi %28, %26 : vector<32xi32> | |
vector.transfer_write %29, %collapse_shape_6[%c0, %c0] {in_bounds = [true]} : vector<32xi32>, memref<1x32xi32, strided<[2073600, 1], offset: ?>> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (Function body identical to the previous dump; the pass made no changes to this function.)
// -----// IR Dump After CSE (cse) //----- //
// (Function body identical to the previous dump; the pass made no changes to this function.)
// -----// IR Dump After LinalgStrategyLowerVectorsPass (iree-linalg-strategy-lower-vectors-pass) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() {
  %c0_i32 = arith.constant 0 : i32
  %cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c64 = arith.constant 64 : index
  %c60 = arith.constant 60 : index
  %c32 = arith.constant 32 : index
  %c43 = arith.constant 43 : index
  %cst_0 = arith.constant dense<0> : vector<32x1xi32>
  %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32>
  %0 = bufferization.to_memref %cst : memref<1x43x1xi32>
  %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32>
  memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32>
  %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32>
  memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
  %subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
  %5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x]
  %6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x]
  %7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6]
  %8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x]
  %9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8]
  %10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7]
  %subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
  %subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>>
  %collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>>
  scf.for %arg0 = %c0 to %c60 step %c1 {
    scf.for %arg1 = %c0 to %c64 step %c32 {
      %11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5]
      %12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5]
      %13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7]
      %14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5]
      %15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7]
      %16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
      %subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
      %subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
      vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
      %subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>>
      %collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>>
      scf.for %arg2 = %c0 to %c43 step %c1 {
        %17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2)
        %18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11)
        %19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13)
        %20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11)
        %21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13)
        %22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
        %subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>
        vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
        %23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>
        vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32>
        %24 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
        %25 = memref.load %0[%c0, %arg2, %c0] : memref<1x43x1xi32>
        %26 = vector.broadcast %25 : i32 to vector<1xi32>
        %27 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
        %28 = vector.shuffle %26, %26 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32>
        %29 = arith.muli %24, %28 : vector<32xi32>
        %30 = arith.addi %29, %27 : vector<32xi32>
        vector.store %30, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>
      }
    }
  }
  return
}
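// Annotation (reader note, not compiler output): relative to the previous dump,
// LinalgStrategyLowerVectorsPass rewrote the in-bounds 1-D transfers on the collapsed
// unit-stride views into plain vector.load / vector.store, and split the filter read into a
// scalar memref.load plus vector.broadcast, with the 32-lane splat still formed by the same
// vector.shuffle. Schematically (with %m standing in for either collapsed 1x32 view):
//
//   %v = vector.transfer_read %m[%c0, %c0], %c0_i32 {in_bounds = [true]}
//          : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
// becomes
//   %v = vector.load %m[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32>
//
// The 2-D transfers that stage the padded input slice through %alloca (including those with
// in_bounds = [false, true]) are left as vector.transfer ops.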
// -----// IR Dump After CSE (cse) //----- //
// (Function body identical to the previous dump; the pass made no changes to this function.)
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- //
// (Function body identical to the previous dump; the pass made no changes to this function.)
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (iree-linalg-strategy-remove-markers-pass) //----- //
// (Function body identical to the previous dump; the pass made no changes to this function.)
// -----// IR Dump After CSE (cse) //----- //
// (Function body identical to the previous dump; the pass made no changes to this function.)
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- //
// (Function body identical to the previous dump; the pass made no changes to this function.)
// -----// IR Dump After LinalgVectorLowering (linalg-vector-lowering) //----- // | |
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c0_i32 = arith.constant 0 : i32 | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c64 = arith.constant 64 : index | |
%c60 = arith.constant 60 : index | |
%c32 = arith.constant 32 : index | |
%c43 = arith.constant 43 : index | |
%cst_0 = arith.constant dense<0> : vector<32x1xi32> | |
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32> | |
%0 = bufferization.to_memref %cst : memref<1x43x1xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %2, 64 : memref<1x1080x1920x1xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%subview = memref.subview %2[0, %3, %4, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%5 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x] | |
%6 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x] | |
%7 = affine.min affine_map<()[s0] -> (1920, s0)>()[%6] | |
%8 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x] | |
%9 = affine.min affine_map<()[s0] -> (1920, s0)>()[%8] | |
%10 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%9, %7] | |
%subview_1 = memref.subview %1[0, %3, %7, 0] [1, 60, %10, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>> | |
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>> | |
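// %alloca is a 64-byte-aligned 1x1x32x1 stack tile; %collapse_shape views it as 1x32 so
// the (possibly partially padded) input slice can later be consumed by a plain vector.load.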
scf.for %arg0 = %c0 to %c60 step %c1 { | |
scf.for %arg1 = %c0 to %c64 step %c32 { | |
%11 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%5] | |
%12 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%5] | |
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%12)[%9, %7] | |
%14 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%5] | |
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%14)[%9, %7] | |
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13) | |
%subview_3 = memref.subview %subview_1[0, %arg0, %13, 0] [1, 1, %16, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> | |
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>> | |
scf.for %arg2 = %c0 to %c43 step %c1 { | |
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%11, %arg2) | |
%18 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %11) | |
%19 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%18, %15, %13) | |
%20 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %11) | |
%21 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%20, %15, %13) | |
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19) | |
%subview_7 = memref.subview %subview_3[0, 0, %19, 0] [1, 1, %22, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
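// Padding emulation: zero-fill the staging tile, read the valid input window
// (%subview_7, width %22) with in_bounds = [false, true] so tail lanes are padded with
// %c0_i32, then write the result into the staging tile at lane offset %17. Lanes that
// fall outside the image therefore keep their zero value.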
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%23 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32> | |
vector.transfer_write %23, %alloca[%c0, %c0, %17, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%24 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32> | |
%25 = memref.load %0[%c0, %arg2, %c0] : memref<1x43x1xi32> | |
%26 = vector.broadcast %25 : i32 to vector<1xi32> | |
%27 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
%28 = vector.shuffle %26, %26 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%29 = arith.muli %24, %28 : vector<32xi32> | |
%30 = arith.addi %29, %27 : vector<32xi32> | |
vector.store %30, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
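// NOTE: Canonicalization folds the bufferization.to_memref of the constant filter away:
// the per-tap memref.load is replaced by a direct tensor.extract from %cst below.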
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c0_i32 = arith.constant 0 : i32 | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c64 = arith.constant 64 : index | |
%c60 = arith.constant 60 : index | |
%c32 = arith.constant 32 : index | |
%c43 = arith.constant 43 : index | |
%cst_0 = arith.constant dense<0> : vector<32x1xi32> | |
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32> | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %0, 64 : memref<1x1080x1920x1xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%subview = memref.subview %1[0, %2, %3, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%4 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x] | |
%5 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x] | |
%6 = affine.min affine_map<()[s0] -> (1920, s0)>()[%5] | |
%7 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x] | |
%8 = affine.min affine_map<()[s0] -> (1920, s0)>()[%7] | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%8, %6] | |
%subview_1 = memref.subview %0[0, %2, %6, 0] [1, 60, %9, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>> | |
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>> | |
scf.for %arg0 = %c0 to %c60 step %c1 { | |
scf.for %arg1 = %c0 to %c64 step %c32 { | |
%10 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%4] | |
%11 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%4] | |
%12 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%11)[%8, %6] | |
%13 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%4] | |
%14 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%13)[%8, %6] | |
%15 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%14, %12) | |
%subview_3 = memref.subview %subview_1[0, %arg0, %12, 0] [1, 1, %15, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
vector.transfer_write %cst_0, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> | |
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>> | |
scf.for %arg2 = %c0 to %c43 step %c1 { | |
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%10, %arg2) | |
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %10) | |
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %14, %12) | |
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %10) | |
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %14, %12) | |
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18) | |
%subview_7 = memref.subview %subview_3[0, 0, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
vector.transfer_write %cst_0, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%22 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [false, true]} : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32> | |
vector.transfer_write %22, %alloca[%c0, %c0, %16, %c0] {in_bounds = [false, true]} : vector<32x1xi32>, memref<1x1x32x1xi32> | |
%23 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32> | |
%extracted = tensor.extract %cst[%c0, %arg2, %c0] : tensor<1x43x1xi32> | |
%24 = vector.broadcast %extracted : i32 to vector<1xi32> | |
%25 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
%26 = vector.shuffle %24, %24 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%27 = arith.muli %23, %26 : vector<32xi32> | |
%28 = arith.addi %27, %25 : vector<32xi32> | |
vector.store %28, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// (identical to the IR after Canonicalizer above; CSE made no changes to this function)
// -----// IR Dump After LinalgStrategyLowerVectorsPass (iree-linalg-strategy-lower-vectors-pass) //----- // | |
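// NOTE: This pass unrolls the remaining vector transfer ops, which is why the index
// constants %c2..%c31 appear below. The in-bounds 32x1 transfer_write of the zero vector
// becomes 32 scalar memref.store ops, and the potentially out-of-bounds
// transfer_read/transfer_write pair becomes chains of scf.if-guarded scalar loads and
// stores, one per lane.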
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c2 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%c4 = arith.constant 4 : index | |
%c5 = arith.constant 5 : index | |
%c6 = arith.constant 6 : index | |
%c7 = arith.constant 7 : index | |
%c8 = arith.constant 8 : index | |
%c9 = arith.constant 9 : index | |
%c10 = arith.constant 10 : index | |
%c11 = arith.constant 11 : index | |
%c12 = arith.constant 12 : index | |
%c13 = arith.constant 13 : index | |
%c14 = arith.constant 14 : index | |
%c15 = arith.constant 15 : index | |
%c16 = arith.constant 16 : index | |
%c17 = arith.constant 17 : index | |
%c18 = arith.constant 18 : index | |
%c19 = arith.constant 19 : index | |
%c20 = arith.constant 20 : index | |
%c21 = arith.constant 21 : index | |
%c22 = arith.constant 22 : index | |
%c23 = arith.constant 23 : index | |
%c24 = arith.constant 24 : index | |
%c25 = arith.constant 25 : index | |
%c26 = arith.constant 26 : index | |
%c27 = arith.constant 27 : index | |
%c28 = arith.constant 28 : index | |
%c29 = arith.constant 29 : index | |
%c30 = arith.constant 30 : index | |
%c31 = arith.constant 31 : index | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c64 = arith.constant 64 : index | |
%c60 = arith.constant 60 : index | |
%c32 = arith.constant 32 : index | |
%c43 = arith.constant 43 : index | |
%cst_0 = arith.constant dense<0> : vector<32x1xi32> | |
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32> | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %0, 64 : memref<1x1080x1920x1xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%subview = memref.subview %1[0, %2, %3, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%4 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x] | |
%5 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x] | |
%6 = affine.min affine_map<()[s0] -> (1920, s0)>()[%5] | |
%7 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x] | |
%8 = affine.min affine_map<()[s0] -> (1920, s0)>()[%7] | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%8, %6] | |
%subview_1 = memref.subview %0[0, %2, %6, 0] [1, 60, %9, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>> | |
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>> | |
scf.for %arg0 = %c0 to %c60 step %c1 { | |
scf.for %arg1 = %c0 to %c64 step %c32 { | |
%10 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%4] | |
%11 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%4] | |
%12 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%11)[%8, %6] | |
%13 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%4] | |
%14 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%13)[%8, %6] | |
%15 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%14, %12) | |
%subview_3 = memref.subview %subview_1[0, %arg0, %12, 0] [1, 1, %15, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
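// Unrolled form of the in-bounds transfer_write of %cst_0: one extract/store per lane.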
%16 = vector.extract %cst_0[0, 0] : vector<32x1xi32> | |
memref.store %16, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%17 = vector.extract %cst_0[1, 0] : vector<32x1xi32> | |
memref.store %17, %subview_4[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%18 = vector.extract %cst_0[2, 0] : vector<32x1xi32> | |
memref.store %18, %subview_4[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%19 = vector.extract %cst_0[3, 0] : vector<32x1xi32> | |
memref.store %19, %subview_4[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%20 = vector.extract %cst_0[4, 0] : vector<32x1xi32> | |
memref.store %20, %subview_4[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%21 = vector.extract %cst_0[5, 0] : vector<32x1xi32> | |
memref.store %21, %subview_4[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%22 = vector.extract %cst_0[6, 0] : vector<32x1xi32> | |
memref.store %22, %subview_4[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%23 = vector.extract %cst_0[7, 0] : vector<32x1xi32> | |
memref.store %23, %subview_4[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%24 = vector.extract %cst_0[8, 0] : vector<32x1xi32> | |
memref.store %24, %subview_4[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%25 = vector.extract %cst_0[9, 0] : vector<32x1xi32> | |
memref.store %25, %subview_4[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%26 = vector.extract %cst_0[10, 0] : vector<32x1xi32> | |
memref.store %26, %subview_4[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%27 = vector.extract %cst_0[11, 0] : vector<32x1xi32> | |
memref.store %27, %subview_4[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%28 = vector.extract %cst_0[12, 0] : vector<32x1xi32> | |
memref.store %28, %subview_4[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%29 = vector.extract %cst_0[13, 0] : vector<32x1xi32> | |
memref.store %29, %subview_4[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%30 = vector.extract %cst_0[14, 0] : vector<32x1xi32> | |
memref.store %30, %subview_4[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%31 = vector.extract %cst_0[15, 0] : vector<32x1xi32> | |
memref.store %31, %subview_4[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%32 = vector.extract %cst_0[16, 0] : vector<32x1xi32> | |
memref.store %32, %subview_4[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%33 = vector.extract %cst_0[17, 0] : vector<32x1xi32> | |
memref.store %33, %subview_4[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%34 = vector.extract %cst_0[18, 0] : vector<32x1xi32> | |
memref.store %34, %subview_4[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%35 = vector.extract %cst_0[19, 0] : vector<32x1xi32> | |
memref.store %35, %subview_4[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%36 = vector.extract %cst_0[20, 0] : vector<32x1xi32> | |
memref.store %36, %subview_4[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%37 = vector.extract %cst_0[21, 0] : vector<32x1xi32> | |
memref.store %37, %subview_4[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%38 = vector.extract %cst_0[22, 0] : vector<32x1xi32> | |
memref.store %38, %subview_4[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%39 = vector.extract %cst_0[23, 0] : vector<32x1xi32> | |
memref.store %39, %subview_4[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%40 = vector.extract %cst_0[24, 0] : vector<32x1xi32> | |
memref.store %40, %subview_4[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%41 = vector.extract %cst_0[25, 0] : vector<32x1xi32> | |
memref.store %41, %subview_4[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%42 = vector.extract %cst_0[26, 0] : vector<32x1xi32> | |
memref.store %42, %subview_4[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%43 = vector.extract %cst_0[27, 0] : vector<32x1xi32> | |
memref.store %43, %subview_4[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%44 = vector.extract %cst_0[28, 0] : vector<32x1xi32> | |
memref.store %44, %subview_4[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%45 = vector.extract %cst_0[29, 0] : vector<32x1xi32> | |
memref.store %45, %subview_4[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%46 = vector.extract %cst_0[30, 0] : vector<32x1xi32> | |
memref.store %46, %subview_4[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%47 = vector.extract %cst_0[31, 0] : vector<32x1xi32> | |
memref.store %47, %subview_4[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> | |
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>> | |
scf.for %arg2 = %c0 to %c43 step %c1 { | |
%48 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%10, %arg2) | |
%49 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %10) | |
%50 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%49, %14, %12) | |
%51 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %10) | |
%52 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%51, %14, %12) | |
%53 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%52, %50) | |
%subview_7 = memref.subview %subview_3[0, 0, %50, 0] [1, 1, %53, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%54 = vector.extract %cst_0[0, 0] : vector<32x1xi32> | |
memref.store %54, %alloca[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32> | |
%55 = vector.extract %cst_0[1, 0] : vector<32x1xi32> | |
memref.store %55, %alloca[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32> | |
%56 = vector.extract %cst_0[2, 0] : vector<32x1xi32> | |
memref.store %56, %alloca[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32> | |
%57 = vector.extract %cst_0[3, 0] : vector<32x1xi32> | |
memref.store %57, %alloca[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32> | |
%58 = vector.extract %cst_0[4, 0] : vector<32x1xi32> | |
memref.store %58, %alloca[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32> | |
%59 = vector.extract %cst_0[5, 0] : vector<32x1xi32> | |
memref.store %59, %alloca[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32> | |
%60 = vector.extract %cst_0[6, 0] : vector<32x1xi32> | |
memref.store %60, %alloca[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32> | |
%61 = vector.extract %cst_0[7, 0] : vector<32x1xi32> | |
memref.store %61, %alloca[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32> | |
%62 = vector.extract %cst_0[8, 0] : vector<32x1xi32> | |
memref.store %62, %alloca[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32> | |
%63 = vector.extract %cst_0[9, 0] : vector<32x1xi32> | |
memref.store %63, %alloca[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32> | |
%64 = vector.extract %cst_0[10, 0] : vector<32x1xi32> | |
memref.store %64, %alloca[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32> | |
%65 = vector.extract %cst_0[11, 0] : vector<32x1xi32> | |
memref.store %65, %alloca[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32> | |
%66 = vector.extract %cst_0[12, 0] : vector<32x1xi32> | |
memref.store %66, %alloca[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32> | |
%67 = vector.extract %cst_0[13, 0] : vector<32x1xi32> | |
memref.store %67, %alloca[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32> | |
%68 = vector.extract %cst_0[14, 0] : vector<32x1xi32> | |
memref.store %68, %alloca[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32> | |
%69 = vector.extract %cst_0[15, 0] : vector<32x1xi32> | |
memref.store %69, %alloca[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32> | |
%70 = vector.extract %cst_0[16, 0] : vector<32x1xi32> | |
memref.store %70, %alloca[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32> | |
%71 = vector.extract %cst_0[17, 0] : vector<32x1xi32> | |
memref.store %71, %alloca[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32> | |
%72 = vector.extract %cst_0[18, 0] : vector<32x1xi32> | |
memref.store %72, %alloca[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32> | |
%73 = vector.extract %cst_0[19, 0] : vector<32x1xi32> | |
memref.store %73, %alloca[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32> | |
%74 = vector.extract %cst_0[20, 0] : vector<32x1xi32> | |
memref.store %74, %alloca[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32> | |
%75 = vector.extract %cst_0[21, 0] : vector<32x1xi32> | |
memref.store %75, %alloca[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32> | |
%76 = vector.extract %cst_0[22, 0] : vector<32x1xi32> | |
memref.store %76, %alloca[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32> | |
%77 = vector.extract %cst_0[23, 0] : vector<32x1xi32> | |
memref.store %77, %alloca[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32> | |
%78 = vector.extract %cst_0[24, 0] : vector<32x1xi32> | |
memref.store %78, %alloca[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32> | |
%79 = vector.extract %cst_0[25, 0] : vector<32x1xi32> | |
memref.store %79, %alloca[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32> | |
%80 = vector.extract %cst_0[26, 0] : vector<32x1xi32> | |
memref.store %80, %alloca[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32> | |
%81 = vector.extract %cst_0[27, 0] : vector<32x1xi32> | |
memref.store %81, %alloca[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32> | |
%82 = vector.extract %cst_0[28, 0] : vector<32x1xi32> | |
memref.store %82, %alloca[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32> | |
%83 = vector.extract %cst_0[29, 0] : vector<32x1xi32> | |
memref.store %83, %alloca[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32> | |
%84 = vector.extract %cst_0[30, 0] : vector<32x1xi32> | |
memref.store %84, %alloca[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32> | |
%85 = vector.extract %cst_0[31, 0] : vector<32x1xi32> | |
memref.store %85, %alloca[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32> | |
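// Unrolled out-of-bounds transfer_read: lane i of the window is loaded only while
// i < %53 (the valid width); otherwise the lane keeps the zero padding value.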
%86 = arith.cmpi sgt, %53, %c0 : index | |
%87 = scf.if %86 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c0, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %cst_0 [0] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %cst_0 : vector<32x1xi32> | |
} | |
%88 = arith.cmpi sgt, %53, %c1 : index | |
%89 = scf.if %88 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c1, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %87 [1] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %87 : vector<32x1xi32> | |
} | |
%90 = arith.cmpi sgt, %53, %c2 : index | |
%91 = scf.if %90 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c2, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %89 [2] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %89 : vector<32x1xi32> | |
} | |
%92 = arith.cmpi sgt, %53, %c3 : index | |
%93 = scf.if %92 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c3, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %91 [3] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %91 : vector<32x1xi32> | |
} | |
%94 = arith.cmpi sgt, %53, %c4 : index | |
%95 = scf.if %94 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c4, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %93 [4] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %93 : vector<32x1xi32> | |
} | |
%96 = arith.cmpi sgt, %53, %c5 : index | |
%97 = scf.if %96 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c5, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %95 [5] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %95 : vector<32x1xi32> | |
} | |
%98 = arith.cmpi sgt, %53, %c6 : index | |
%99 = scf.if %98 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c6, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %97 [6] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %97 : vector<32x1xi32> | |
} | |
%100 = arith.cmpi sgt, %53, %c7 : index | |
%101 = scf.if %100 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c7, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %99 [7] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %99 : vector<32x1xi32> | |
} | |
%102 = arith.cmpi sgt, %53, %c8 : index | |
%103 = scf.if %102 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c8, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %101 [8] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %101 : vector<32x1xi32> | |
} | |
%104 = arith.cmpi sgt, %53, %c9 : index | |
%105 = scf.if %104 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c9, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %103 [9] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %103 : vector<32x1xi32> | |
} | |
%106 = arith.cmpi sgt, %53, %c10 : index | |
%107 = scf.if %106 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c10, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %105 [10] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %105 : vector<32x1xi32> | |
} | |
%108 = arith.cmpi sgt, %53, %c11 : index | |
%109 = scf.if %108 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c11, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %107 [11] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %107 : vector<32x1xi32> | |
} | |
%110 = arith.cmpi sgt, %53, %c12 : index | |
%111 = scf.if %110 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c12, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %109 [12] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %109 : vector<32x1xi32> | |
} | |
%112 = arith.cmpi sgt, %53, %c13 : index | |
%113 = scf.if %112 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c13, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %111 [13] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %111 : vector<32x1xi32> | |
} | |
%114 = arith.cmpi sgt, %53, %c14 : index | |
%115 = scf.if %114 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c14, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %113 [14] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %113 : vector<32x1xi32> | |
} | |
%116 = arith.cmpi sgt, %53, %c15 : index | |
%117 = scf.if %116 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c15, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %115 [15] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %115 : vector<32x1xi32> | |
} | |
%118 = arith.cmpi sgt, %53, %c16 : index | |
%119 = scf.if %118 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c16, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %117 [16] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %117 : vector<32x1xi32> | |
} | |
%120 = arith.cmpi sgt, %53, %c17 : index | |
%121 = scf.if %120 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c17, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %119 [17] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %119 : vector<32x1xi32> | |
} | |
%122 = arith.cmpi sgt, %53, %c18 : index | |
%123 = scf.if %122 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c18, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %121 [18] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %121 : vector<32x1xi32> | |
} | |
%124 = arith.cmpi sgt, %53, %c19 : index | |
%125 = scf.if %124 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c19, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %123 [19] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %123 : vector<32x1xi32> | |
} | |
%126 = arith.cmpi sgt, %53, %c20 : index | |
%127 = scf.if %126 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c20, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %125 [20] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %125 : vector<32x1xi32> | |
} | |
%128 = arith.cmpi sgt, %53, %c21 : index | |
%129 = scf.if %128 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c21, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %127 [21] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %127 : vector<32x1xi32> | |
} | |
%130 = arith.cmpi sgt, %53, %c22 : index | |
%131 = scf.if %130 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c22, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %129 [22] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %129 : vector<32x1xi32> | |
} | |
%132 = arith.cmpi sgt, %53, %c23 : index | |
%133 = scf.if %132 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c23, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %131 [23] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %131 : vector<32x1xi32> | |
} | |
%134 = arith.cmpi sgt, %53, %c24 : index | |
%135 = scf.if %134 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c24, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %133 [24] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %133 : vector<32x1xi32> | |
} | |
%136 = arith.cmpi sgt, %53, %c25 : index | |
%137 = scf.if %136 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c25, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %135 [25] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %135 : vector<32x1xi32> | |
} | |
%138 = arith.cmpi sgt, %53, %c26 : index | |
%139 = scf.if %138 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c26, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %137 [26] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %137 : vector<32x1xi32> | |
} | |
%140 = arith.cmpi sgt, %53, %c27 : index | |
%141 = scf.if %140 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c27, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %139 [27] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %139 : vector<32x1xi32> | |
} | |
%142 = arith.cmpi sgt, %53, %c28 : index | |
%143 = scf.if %142 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c28, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %141 [28] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %141 : vector<32x1xi32> | |
} | |
%144 = arith.cmpi sgt, %53, %c29 : index | |
%145 = scf.if %144 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c29, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %143 [29] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %143 : vector<32x1xi32> | |
} | |
%146 = arith.cmpi sgt, %53, %c30 : index | |
%147 = scf.if %146 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c30, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %145 [30] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %145 : vector<32x1xi32> | |
} | |
%148 = arith.cmpi sgt, %53, %c31 : index | |
%149 = scf.if %148 -> (vector<32x1xi32>) { | |
%219 = memref.load %subview_7[%c0, %c0, %c31, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%220 = vector.broadcast %219 : i32 to vector<1xi32> | |
%221 = vector.insert %220, %147 [31] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %221 : vector<32x1xi32> | |
} else { | |
scf.yield %147 : vector<32x1xi32> | |
} | |
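// Unrolled transfer_write at lane offset %48: lane i is stored only while %48 + i < 32.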
%150 = arith.cmpi slt, %48, %c32 : index | |
scf.if %150 { | |
%219 = vector.extract %149[0, 0] : vector<32x1xi32> | |
memref.store %219, %alloca[%c0, %c0, %48, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%151 = affine.apply affine_map<(d0) -> (d0 + 1)>(%48) | |
%152 = arith.cmpi slt, %151, %c32 : index | |
scf.if %152 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 1)>(%48) | |
%220 = vector.extract %149[1, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%153 = affine.apply affine_map<(d0) -> (d0 + 2)>(%48) | |
%154 = arith.cmpi slt, %153, %c32 : index | |
scf.if %154 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 2)>(%48) | |
%220 = vector.extract %149[2, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%155 = affine.apply affine_map<(d0) -> (d0 + 3)>(%48) | |
%156 = arith.cmpi slt, %155, %c32 : index | |
scf.if %156 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 3)>(%48) | |
%220 = vector.extract %149[3, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%157 = affine.apply affine_map<(d0) -> (d0 + 4)>(%48) | |
%158 = arith.cmpi slt, %157, %c32 : index | |
scf.if %158 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 4)>(%48) | |
%220 = vector.extract %149[4, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%159 = affine.apply affine_map<(d0) -> (d0 + 5)>(%48) | |
%160 = arith.cmpi slt, %159, %c32 : index | |
scf.if %160 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 5)>(%48) | |
%220 = vector.extract %149[5, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%161 = affine.apply affine_map<(d0) -> (d0 + 6)>(%48) | |
%162 = arith.cmpi slt, %161, %c32 : index | |
scf.if %162 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 6)>(%48) | |
%220 = vector.extract %149[6, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%163 = affine.apply affine_map<(d0) -> (d0 + 7)>(%48) | |
%164 = arith.cmpi slt, %163, %c32 : index | |
scf.if %164 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 7)>(%48) | |
%220 = vector.extract %149[7, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%165 = affine.apply affine_map<(d0) -> (d0 + 8)>(%48) | |
%166 = arith.cmpi slt, %165, %c32 : index | |
scf.if %166 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 8)>(%48) | |
%220 = vector.extract %149[8, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%167 = affine.apply affine_map<(d0) -> (d0 + 9)>(%48) | |
%168 = arith.cmpi slt, %167, %c32 : index | |
scf.if %168 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 9)>(%48) | |
%220 = vector.extract %149[9, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%169 = affine.apply affine_map<(d0) -> (d0 + 10)>(%48) | |
%170 = arith.cmpi slt, %169, %c32 : index | |
scf.if %170 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 10)>(%48) | |
%220 = vector.extract %149[10, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%171 = affine.apply affine_map<(d0) -> (d0 + 11)>(%48) | |
%172 = arith.cmpi slt, %171, %c32 : index | |
scf.if %172 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 11)>(%48) | |
%220 = vector.extract %149[11, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%173 = affine.apply affine_map<(d0) -> (d0 + 12)>(%48) | |
%174 = arith.cmpi slt, %173, %c32 : index | |
scf.if %174 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 12)>(%48) | |
%220 = vector.extract %149[12, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%175 = affine.apply affine_map<(d0) -> (d0 + 13)>(%48) | |
%176 = arith.cmpi slt, %175, %c32 : index | |
scf.if %176 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 13)>(%48) | |
%220 = vector.extract %149[13, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%177 = affine.apply affine_map<(d0) -> (d0 + 14)>(%48) | |
%178 = arith.cmpi slt, %177, %c32 : index | |
scf.if %178 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 14)>(%48) | |
%220 = vector.extract %149[14, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%179 = affine.apply affine_map<(d0) -> (d0 + 15)>(%48) | |
%180 = arith.cmpi slt, %179, %c32 : index | |
scf.if %180 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 15)>(%48) | |
%220 = vector.extract %149[15, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%181 = affine.apply affine_map<(d0) -> (d0 + 16)>(%48) | |
%182 = arith.cmpi slt, %181, %c32 : index | |
scf.if %182 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 16)>(%48) | |
%220 = vector.extract %149[16, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%183 = affine.apply affine_map<(d0) -> (d0 + 17)>(%48) | |
%184 = arith.cmpi slt, %183, %c32 : index | |
scf.if %184 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 17)>(%48) | |
%220 = vector.extract %149[17, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%185 = affine.apply affine_map<(d0) -> (d0 + 18)>(%48) | |
%186 = arith.cmpi slt, %185, %c32 : index | |
scf.if %186 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 18)>(%48) | |
%220 = vector.extract %149[18, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%187 = affine.apply affine_map<(d0) -> (d0 + 19)>(%48) | |
%188 = arith.cmpi slt, %187, %c32 : index | |
scf.if %188 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 19)>(%48) | |
%220 = vector.extract %149[19, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%189 = affine.apply affine_map<(d0) -> (d0 + 20)>(%48) | |
%190 = arith.cmpi slt, %189, %c32 : index | |
scf.if %190 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 20)>(%48) | |
%220 = vector.extract %149[20, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%191 = affine.apply affine_map<(d0) -> (d0 + 21)>(%48) | |
%192 = arith.cmpi slt, %191, %c32 : index | |
scf.if %192 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 21)>(%48) | |
%220 = vector.extract %149[21, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%193 = affine.apply affine_map<(d0) -> (d0 + 22)>(%48) | |
%194 = arith.cmpi slt, %193, %c32 : index | |
scf.if %194 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 22)>(%48) | |
%220 = vector.extract %149[22, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%195 = affine.apply affine_map<(d0) -> (d0 + 23)>(%48) | |
%196 = arith.cmpi slt, %195, %c32 : index | |
scf.if %196 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 23)>(%48) | |
%220 = vector.extract %149[23, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%197 = affine.apply affine_map<(d0) -> (d0 + 24)>(%48) | |
%198 = arith.cmpi slt, %197, %c32 : index | |
scf.if %198 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 24)>(%48) | |
%220 = vector.extract %149[24, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%199 = affine.apply affine_map<(d0) -> (d0 + 25)>(%48) | |
%200 = arith.cmpi slt, %199, %c32 : index | |
scf.if %200 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 25)>(%48) | |
%220 = vector.extract %149[25, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%201 = affine.apply affine_map<(d0) -> (d0 + 26)>(%48) | |
%202 = arith.cmpi slt, %201, %c32 : index | |
scf.if %202 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 26)>(%48) | |
%220 = vector.extract %149[26, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%203 = affine.apply affine_map<(d0) -> (d0 + 27)>(%48) | |
%204 = arith.cmpi slt, %203, %c32 : index | |
scf.if %204 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 27)>(%48) | |
%220 = vector.extract %149[27, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%205 = affine.apply affine_map<(d0) -> (d0 + 28)>(%48) | |
%206 = arith.cmpi slt, %205, %c32 : index | |
scf.if %206 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 28)>(%48) | |
%220 = vector.extract %149[28, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%207 = affine.apply affine_map<(d0) -> (d0 + 29)>(%48) | |
%208 = arith.cmpi slt, %207, %c32 : index | |
scf.if %208 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 29)>(%48) | |
%220 = vector.extract %149[29, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%209 = affine.apply affine_map<(d0) -> (d0 + 30)>(%48) | |
%210 = arith.cmpi slt, %209, %c32 : index | |
scf.if %210 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 30)>(%48) | |
%220 = vector.extract %149[30, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%211 = affine.apply affine_map<(d0) -> (d0 + 31)>(%48) | |
%212 = arith.cmpi slt, %211, %c32 : index | |
scf.if %212 { | |
%219 = affine.apply affine_map<(d0) -> (d0 + 31)>(%48) | |
%220 = vector.extract %149[31, 0] : vector<32x1xi32> | |
memref.store %220, %alloca[%c0, %c0, %219, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
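// One multiply-accumulate per filter tap: out_vec += staged_input_vec * filter[%arg2].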
%213 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32> | |
%extracted = tensor.extract %cst[%c0, %arg2, %c0] : tensor<1x43x1xi32> | |
%214 = vector.broadcast %extracted : i32 to vector<1xi32> | |
%215 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
%216 = vector.shuffle %214, %214 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%217 = arith.muli %213, %216 : vector<32xi32> | |
%218 = arith.addi %217, %215 : vector<32xi32> | |
vector.store %218, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
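// NOTE: relative to the dump above, CSE has deduplicated the per-lane index
// computations: each guarded store now reuses the affine.apply value already
// computed for its bound check (previously recomputed as %219 inside every
// scf.if), and the shared index constants %c2 ... %c31 appear once at the
// top of the function.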
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c2 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%c4 = arith.constant 4 : index | |
%c5 = arith.constant 5 : index | |
%c6 = arith.constant 6 : index | |
%c7 = arith.constant 7 : index | |
%c8 = arith.constant 8 : index | |
%c9 = arith.constant 9 : index | |
%c10 = arith.constant 10 : index | |
%c11 = arith.constant 11 : index | |
%c12 = arith.constant 12 : index | |
%c13 = arith.constant 13 : index | |
%c14 = arith.constant 14 : index | |
%c15 = arith.constant 15 : index | |
%c16 = arith.constant 16 : index | |
%c17 = arith.constant 17 : index | |
%c18 = arith.constant 18 : index | |
%c19 = arith.constant 19 : index | |
%c20 = arith.constant 20 : index | |
%c21 = arith.constant 21 : index | |
%c22 = arith.constant 22 : index | |
%c23 = arith.constant 23 : index | |
%c24 = arith.constant 24 : index | |
%c25 = arith.constant 25 : index | |
%c26 = arith.constant 26 : index | |
%c27 = arith.constant 27 : index | |
%c28 = arith.constant 28 : index | |
%c29 = arith.constant 29 : index | |
%c30 = arith.constant 30 : index | |
%c31 = arith.constant 31 : index | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c64 = arith.constant 64 : index | |
%c60 = arith.constant 60 : index | |
%c32 = arith.constant 32 : index | |
%c43 = arith.constant 43 : index | |
%cst_0 = arith.constant dense<0> : vector<32x1xi32> | |
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32> | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %0, 64 : memref<1x1080x1920x1xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%subview = memref.subview %1[0, %2, %3, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%4 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x] | |
%5 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x] | |
%6 = affine.min affine_map<()[s0] -> (1920, s0)>()[%5] | |
%7 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x] | |
%8 = affine.min affine_map<()[s0] -> (1920, s0)>()[%7] | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%8, %6] | |
%subview_1 = memref.subview %0[0, %2, %6, 0] [1, 60, %9, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
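// %subview is this workgroup's 1x60x64x1 output tile. %subview_1 is the
// matching input slice, widened by the halo of the 43-tap filter (nominally
// 21 columns on each side of the 64-wide tile, clamped to the [0, 1920)
// image width by the affine.max/affine.min chain above).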
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>> | |
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>> | |
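// %alloca is a 64-byte-aligned stack scratch tile holding the padded input
// window; the rank-reducing subview plus collapse_shape expose it as a
// contiguous 1x32 buffer (%collapse_shape) so the inner loop can read it
// back with a single 32-lane vector.load.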
%10 = vector.extract %cst_0[0, 0] : vector<32x1xi32> | |
%11 = vector.extract %cst_0[1, 0] : vector<32x1xi32> | |
%12 = vector.extract %cst_0[2, 0] : vector<32x1xi32> | |
%13 = vector.extract %cst_0[3, 0] : vector<32x1xi32> | |
%14 = vector.extract %cst_0[4, 0] : vector<32x1xi32> | |
%15 = vector.extract %cst_0[5, 0] : vector<32x1xi32> | |
%16 = vector.extract %cst_0[6, 0] : vector<32x1xi32> | |
%17 = vector.extract %cst_0[7, 0] : vector<32x1xi32> | |
%18 = vector.extract %cst_0[8, 0] : vector<32x1xi32> | |
%19 = vector.extract %cst_0[9, 0] : vector<32x1xi32> | |
%20 = vector.extract %cst_0[10, 0] : vector<32x1xi32> | |
%21 = vector.extract %cst_0[11, 0] : vector<32x1xi32> | |
%22 = vector.extract %cst_0[12, 0] : vector<32x1xi32> | |
%23 = vector.extract %cst_0[13, 0] : vector<32x1xi32> | |
%24 = vector.extract %cst_0[14, 0] : vector<32x1xi32> | |
%25 = vector.extract %cst_0[15, 0] : vector<32x1xi32> | |
%26 = vector.extract %cst_0[16, 0] : vector<32x1xi32> | |
%27 = vector.extract %cst_0[17, 0] : vector<32x1xi32> | |
%28 = vector.extract %cst_0[18, 0] : vector<32x1xi32> | |
%29 = vector.extract %cst_0[19, 0] : vector<32x1xi32> | |
%30 = vector.extract %cst_0[20, 0] : vector<32x1xi32> | |
%31 = vector.extract %cst_0[21, 0] : vector<32x1xi32> | |
%32 = vector.extract %cst_0[22, 0] : vector<32x1xi32> | |
%33 = vector.extract %cst_0[23, 0] : vector<32x1xi32> | |
%34 = vector.extract %cst_0[24, 0] : vector<32x1xi32> | |
%35 = vector.extract %cst_0[25, 0] : vector<32x1xi32> | |
%36 = vector.extract %cst_0[26, 0] : vector<32x1xi32> | |
%37 = vector.extract %cst_0[27, 0] : vector<32x1xi32> | |
%38 = vector.extract %cst_0[28, 0] : vector<32x1xi32> | |
%39 = vector.extract %cst_0[29, 0] : vector<32x1xi32> | |
%40 = vector.extract %cst_0[30, 0] : vector<32x1xi32> | |
%41 = vector.extract %cst_0[31, 0] : vector<32x1xi32> | |
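// All 32 vector.extract ops above read lanes of the zero splat %cst_0, so
// %10 ... %41 are just the i32 constant 0, scalarized for the stores that
// zero-initialize the output tile and the scratch tile below.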
scf.for %arg0 = %c0 to %c60 step %c1 { | |
scf.for %arg1 = %c0 to %c64 step %c32 { | |
%42 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%4] | |
%43 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%4] | |
%44 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%43)[%8, %6] | |
%45 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%4] | |
%46 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%45)[%8, %6] | |
%47 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%46, %44) | |
%subview_3 = memref.subview %subview_1[0, %arg0, %44, 0] [1, 1, %47, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %10, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %11, %subview_4[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %12, %subview_4[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %13, %subview_4[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %14, %subview_4[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %15, %subview_4[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %16, %subview_4[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %17, %subview_4[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %18, %subview_4[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %19, %subview_4[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %20, %subview_4[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %21, %subview_4[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %22, %subview_4[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %23, %subview_4[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %24, %subview_4[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %25, %subview_4[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %26, %subview_4[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %27, %subview_4[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %28, %subview_4[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %29, %subview_4[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %30, %subview_4[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %31, %subview_4[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %32, %subview_4[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %33, %subview_4[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %34, %subview_4[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %35, %subview_4[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %36, %subview_4[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %37, %subview_4[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %38, %subview_4[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %39, %subview_4[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %40, %subview_4[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %41, %subview_4[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> | |
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>> | |
scf.for %arg2 = %c0 to %c43 step %c1 { | |
%48 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%42, %arg2) | |
%49 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %42) | |
%50 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%49, %46, %44) | |
%51 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %42) | |
%52 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%51, %46, %44) | |
%53 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%52, %50) | |
%subview_7 = memref.subview %subview_3[0, 0, %50, 0] [1, 1, %53, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
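// For filter tap %arg2: [%50, %52) is the in-bounds slice of the 32-wide
// input window (%53 is its length, possibly zero), and %48 is the offset in
// the scratch tile where the in-bounds data starts; lanes below %48 keep
// their zero padding.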
memref.store %10, %alloca[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32> | |
memref.store %11, %alloca[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32> | |
memref.store %12, %alloca[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32> | |
memref.store %13, %alloca[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32> | |
memref.store %14, %alloca[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32> | |
memref.store %15, %alloca[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32> | |
memref.store %16, %alloca[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32> | |
memref.store %17, %alloca[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32> | |
memref.store %18, %alloca[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32> | |
memref.store %19, %alloca[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32> | |
memref.store %20, %alloca[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32> | |
memref.store %21, %alloca[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32> | |
memref.store %22, %alloca[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32> | |
memref.store %23, %alloca[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32> | |
memref.store %24, %alloca[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32> | |
memref.store %25, %alloca[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32> | |
memref.store %26, %alloca[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32> | |
memref.store %27, %alloca[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32> | |
memref.store %28, %alloca[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32> | |
memref.store %29, %alloca[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32> | |
memref.store %30, %alloca[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32> | |
memref.store %31, %alloca[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32> | |
memref.store %32, %alloca[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32> | |
memref.store %33, %alloca[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32> | |
memref.store %34, %alloca[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32> | |
memref.store %35, %alloca[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32> | |
memref.store %36, %alloca[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32> | |
memref.store %37, %alloca[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32> | |
memref.store %38, %alloca[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32> | |
memref.store %39, %alloca[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32> | |
memref.store %40, %alloca[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32> | |
memref.store %41, %alloca[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32> | |
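// Element-wise guarded gather: lane i of %117 is loaded from %subview_7 only
// when i < %53; out-of-bounds lanes keep the zero from %cst_0. This is
// effectively a masked 32-lane vector load, fully scalarized.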
%54 = arith.cmpi sgt, %53, %c0 : index | |
%55 = scf.if %54 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c0, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %cst_0 [0] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %cst_0 : vector<32x1xi32> | |
} | |
%56 = arith.cmpi sgt, %53, %c1 : index | |
%57 = scf.if %56 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c1, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %55 [1] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %55 : vector<32x1xi32> | |
} | |
%58 = arith.cmpi sgt, %53, %c2 : index | |
%59 = scf.if %58 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c2, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %57 [2] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %57 : vector<32x1xi32> | |
} | |
%60 = arith.cmpi sgt, %53, %c3 : index | |
%61 = scf.if %60 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c3, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %59 [3] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %59 : vector<32x1xi32> | |
} | |
%62 = arith.cmpi sgt, %53, %c4 : index | |
%63 = scf.if %62 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c4, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %61 [4] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %61 : vector<32x1xi32> | |
} | |
%64 = arith.cmpi sgt, %53, %c5 : index | |
%65 = scf.if %64 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c5, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %63 [5] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %63 : vector<32x1xi32> | |
} | |
%66 = arith.cmpi sgt, %53, %c6 : index | |
%67 = scf.if %66 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c6, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %65 [6] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %65 : vector<32x1xi32> | |
} | |
%68 = arith.cmpi sgt, %53, %c7 : index | |
%69 = scf.if %68 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c7, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %67 [7] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %67 : vector<32x1xi32> | |
} | |
%70 = arith.cmpi sgt, %53, %c8 : index | |
%71 = scf.if %70 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c8, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %69 [8] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %69 : vector<32x1xi32> | |
} | |
%72 = arith.cmpi sgt, %53, %c9 : index | |
%73 = scf.if %72 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c9, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %71 [9] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %71 : vector<32x1xi32> | |
} | |
%74 = arith.cmpi sgt, %53, %c10 : index | |
%75 = scf.if %74 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c10, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %73 [10] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %73 : vector<32x1xi32> | |
} | |
%76 = arith.cmpi sgt, %53, %c11 : index | |
%77 = scf.if %76 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c11, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %75 [11] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %75 : vector<32x1xi32> | |
} | |
%78 = arith.cmpi sgt, %53, %c12 : index | |
%79 = scf.if %78 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c12, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %77 [12] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %77 : vector<32x1xi32> | |
} | |
%80 = arith.cmpi sgt, %53, %c13 : index | |
%81 = scf.if %80 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c13, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %79 [13] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %79 : vector<32x1xi32> | |
} | |
%82 = arith.cmpi sgt, %53, %c14 : index | |
%83 = scf.if %82 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c14, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %81 [14] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %81 : vector<32x1xi32> | |
} | |
%84 = arith.cmpi sgt, %53, %c15 : index | |
%85 = scf.if %84 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c15, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %83 [15] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %83 : vector<32x1xi32> | |
} | |
%86 = arith.cmpi sgt, %53, %c16 : index | |
%87 = scf.if %86 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c16, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %85 [16] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %85 : vector<32x1xi32> | |
} | |
%88 = arith.cmpi sgt, %53, %c17 : index | |
%89 = scf.if %88 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c17, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %87 [17] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %87 : vector<32x1xi32> | |
} | |
%90 = arith.cmpi sgt, %53, %c18 : index | |
%91 = scf.if %90 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c18, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %89 [18] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %89 : vector<32x1xi32> | |
} | |
%92 = arith.cmpi sgt, %53, %c19 : index | |
%93 = scf.if %92 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c19, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %91 [19] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %91 : vector<32x1xi32> | |
} | |
%94 = arith.cmpi sgt, %53, %c20 : index | |
%95 = scf.if %94 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c20, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %93 [20] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %93 : vector<32x1xi32> | |
} | |
%96 = arith.cmpi sgt, %53, %c21 : index | |
%97 = scf.if %96 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c21, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %95 [21] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %95 : vector<32x1xi32> | |
} | |
%98 = arith.cmpi sgt, %53, %c22 : index | |
%99 = scf.if %98 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c22, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %97 [22] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %97 : vector<32x1xi32> | |
} | |
%100 = arith.cmpi sgt, %53, %c23 : index | |
%101 = scf.if %100 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c23, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %99 [23] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %99 : vector<32x1xi32> | |
} | |
%102 = arith.cmpi sgt, %53, %c24 : index | |
%103 = scf.if %102 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c24, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %101 [24] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %101 : vector<32x1xi32> | |
} | |
%104 = arith.cmpi sgt, %53, %c25 : index | |
%105 = scf.if %104 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c25, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %103 [25] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %103 : vector<32x1xi32> | |
} | |
%106 = arith.cmpi sgt, %53, %c26 : index | |
%107 = scf.if %106 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c26, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %105 [26] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %105 : vector<32x1xi32> | |
} | |
%108 = arith.cmpi sgt, %53, %c27 : index | |
%109 = scf.if %108 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c27, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %107 [27] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %107 : vector<32x1xi32> | |
} | |
%110 = arith.cmpi sgt, %53, %c28 : index | |
%111 = scf.if %110 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c28, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %109 [28] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %109 : vector<32x1xi32> | |
} | |
%112 = arith.cmpi sgt, %53, %c29 : index | |
%113 = scf.if %112 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c29, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %111 [29] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %111 : vector<32x1xi32> | |
} | |
%114 = arith.cmpi sgt, %53, %c30 : index | |
%115 = scf.if %114 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c30, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %113 [30] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %113 : vector<32x1xi32> | |
} | |
%116 = arith.cmpi sgt, %53, %c31 : index | |
%117 = scf.if %116 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c31, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %115 [31] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %115 : vector<32x1xi32> | |
} | |
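// Guarded scatter: the gathered lanes of %117 are written into the scratch
// tile starting at offset %48, each store bounds-checked against the
// 32-element tile width.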
%118 = arith.cmpi slt, %48, %c32 : index | |
scf.if %118 { | |
%187 = vector.extract %117[0, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %48, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%119 = affine.apply affine_map<(d0) -> (d0 + 1)>(%48) | |
%120 = arith.cmpi slt, %119, %c32 : index | |
scf.if %120 { | |
%187 = vector.extract %117[1, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %119, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%121 = affine.apply affine_map<(d0) -> (d0 + 2)>(%48) | |
%122 = arith.cmpi slt, %121, %c32 : index | |
scf.if %122 { | |
%187 = vector.extract %117[2, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %121, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%123 = affine.apply affine_map<(d0) -> (d0 + 3)>(%48) | |
%124 = arith.cmpi slt, %123, %c32 : index | |
scf.if %124 { | |
%187 = vector.extract %117[3, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %123, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%125 = affine.apply affine_map<(d0) -> (d0 + 4)>(%48) | |
%126 = arith.cmpi slt, %125, %c32 : index | |
scf.if %126 { | |
%187 = vector.extract %117[4, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %125, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%127 = affine.apply affine_map<(d0) -> (d0 + 5)>(%48) | |
%128 = arith.cmpi slt, %127, %c32 : index | |
scf.if %128 { | |
%187 = vector.extract %117[5, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %127, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%129 = affine.apply affine_map<(d0) -> (d0 + 6)>(%48) | |
%130 = arith.cmpi slt, %129, %c32 : index | |
scf.if %130 { | |
%187 = vector.extract %117[6, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %129, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%131 = affine.apply affine_map<(d0) -> (d0 + 7)>(%48) | |
%132 = arith.cmpi slt, %131, %c32 : index | |
scf.if %132 { | |
%187 = vector.extract %117[7, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %131, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%133 = affine.apply affine_map<(d0) -> (d0 + 8)>(%48) | |
%134 = arith.cmpi slt, %133, %c32 : index | |
scf.if %134 { | |
%187 = vector.extract %117[8, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %133, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%135 = affine.apply affine_map<(d0) -> (d0 + 9)>(%48) | |
%136 = arith.cmpi slt, %135, %c32 : index | |
scf.if %136 { | |
%187 = vector.extract %117[9, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %135, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%137 = affine.apply affine_map<(d0) -> (d0 + 10)>(%48) | |
%138 = arith.cmpi slt, %137, %c32 : index | |
scf.if %138 { | |
%187 = vector.extract %117[10, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %137, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%139 = affine.apply affine_map<(d0) -> (d0 + 11)>(%48) | |
%140 = arith.cmpi slt, %139, %c32 : index | |
scf.if %140 { | |
%187 = vector.extract %117[11, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %139, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%141 = affine.apply affine_map<(d0) -> (d0 + 12)>(%48) | |
%142 = arith.cmpi slt, %141, %c32 : index | |
scf.if %142 { | |
%187 = vector.extract %117[12, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %141, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%143 = affine.apply affine_map<(d0) -> (d0 + 13)>(%48) | |
%144 = arith.cmpi slt, %143, %c32 : index | |
scf.if %144 { | |
%187 = vector.extract %117[13, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %143, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%145 = affine.apply affine_map<(d0) -> (d0 + 14)>(%48) | |
%146 = arith.cmpi slt, %145, %c32 : index | |
scf.if %146 { | |
%187 = vector.extract %117[14, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %145, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%147 = affine.apply affine_map<(d0) -> (d0 + 15)>(%48) | |
%148 = arith.cmpi slt, %147, %c32 : index | |
scf.if %148 { | |
%187 = vector.extract %117[15, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %147, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%149 = affine.apply affine_map<(d0) -> (d0 + 16)>(%48) | |
%150 = arith.cmpi slt, %149, %c32 : index | |
scf.if %150 { | |
%187 = vector.extract %117[16, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %149, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%151 = affine.apply affine_map<(d0) -> (d0 + 17)>(%48) | |
%152 = arith.cmpi slt, %151, %c32 : index | |
scf.if %152 { | |
%187 = vector.extract %117[17, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %151, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%153 = affine.apply affine_map<(d0) -> (d0 + 18)>(%48) | |
%154 = arith.cmpi slt, %153, %c32 : index | |
scf.if %154 { | |
%187 = vector.extract %117[18, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %153, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%155 = affine.apply affine_map<(d0) -> (d0 + 19)>(%48) | |
%156 = arith.cmpi slt, %155, %c32 : index | |
scf.if %156 { | |
%187 = vector.extract %117[19, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %155, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%157 = affine.apply affine_map<(d0) -> (d0 + 20)>(%48) | |
%158 = arith.cmpi slt, %157, %c32 : index | |
scf.if %158 { | |
%187 = vector.extract %117[20, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %157, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%159 = affine.apply affine_map<(d0) -> (d0 + 21)>(%48) | |
%160 = arith.cmpi slt, %159, %c32 : index | |
scf.if %160 { | |
%187 = vector.extract %117[21, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %159, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%161 = affine.apply affine_map<(d0) -> (d0 + 22)>(%48) | |
%162 = arith.cmpi slt, %161, %c32 : index | |
scf.if %162 { | |
%187 = vector.extract %117[22, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %161, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%163 = affine.apply affine_map<(d0) -> (d0 + 23)>(%48) | |
%164 = arith.cmpi slt, %163, %c32 : index | |
scf.if %164 { | |
%187 = vector.extract %117[23, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %163, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%165 = affine.apply affine_map<(d0) -> (d0 + 24)>(%48) | |
%166 = arith.cmpi slt, %165, %c32 : index | |
scf.if %166 { | |
%187 = vector.extract %117[24, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %165, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%167 = affine.apply affine_map<(d0) -> (d0 + 25)>(%48) | |
%168 = arith.cmpi slt, %167, %c32 : index | |
scf.if %168 { | |
%187 = vector.extract %117[25, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %167, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%169 = affine.apply affine_map<(d0) -> (d0 + 26)>(%48) | |
%170 = arith.cmpi slt, %169, %c32 : index | |
scf.if %170 { | |
%187 = vector.extract %117[26, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %169, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%171 = affine.apply affine_map<(d0) -> (d0 + 27)>(%48) | |
%172 = arith.cmpi slt, %171, %c32 : index | |
scf.if %172 { | |
%187 = vector.extract %117[27, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %171, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%173 = affine.apply affine_map<(d0) -> (d0 + 28)>(%48) | |
%174 = arith.cmpi slt, %173, %c32 : index | |
scf.if %174 { | |
%187 = vector.extract %117[28, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %173, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%175 = affine.apply affine_map<(d0) -> (d0 + 29)>(%48) | |
%176 = arith.cmpi slt, %175, %c32 : index | |
scf.if %176 { | |
%187 = vector.extract %117[29, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %175, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%177 = affine.apply affine_map<(d0) -> (d0 + 30)>(%48) | |
%178 = arith.cmpi slt, %177, %c32 : index | |
scf.if %178 { | |
%187 = vector.extract %117[30, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %177, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%179 = affine.apply affine_map<(d0) -> (d0 + 31)>(%48) | |
%180 = arith.cmpi slt, %179, %c32 : index | |
scf.if %180 { | |
%187 = vector.extract %117[31, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %179, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
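// Multiply-accumulate for this tap: out[0:32] += in_padded[0:32] * w, where
// w is tap %arg2 of the 1x43x1 filter, splatted across 32 lanes via
// vector.broadcast + vector.shuffle.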
%181 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32> | |
%extracted = tensor.extract %cst[%c0, %arg2, %c0] : tensor<1x43x1xi32> | |
%182 = vector.broadcast %extracted : i32 to vector<1xi32> | |
%183 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
%184 = vector.shuffle %182, %182 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%185 = arith.muli %181, %184 : vector<32xi32> | |
%186 = arith.addi %185, %183 : vector<32xi32> | |
vector.store %186, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- // | |
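// NOTE: for the portion shown below (this dump is truncated), the IR is
// identical to the post-CSE dump above; the pass appears to have made no
// changes to this function.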
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c2 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%c4 = arith.constant 4 : index | |
%c5 = arith.constant 5 : index | |
%c6 = arith.constant 6 : index | |
%c7 = arith.constant 7 : index | |
%c8 = arith.constant 8 : index | |
%c9 = arith.constant 9 : index | |
%c10 = arith.constant 10 : index | |
%c11 = arith.constant 11 : index | |
%c12 = arith.constant 12 : index | |
%c13 = arith.constant 13 : index | |
%c14 = arith.constant 14 : index | |
%c15 = arith.constant 15 : index | |
%c16 = arith.constant 16 : index | |
%c17 = arith.constant 17 : index | |
%c18 = arith.constant 18 : index | |
%c19 = arith.constant 19 : index | |
%c20 = arith.constant 20 : index | |
%c21 = arith.constant 21 : index | |
%c22 = arith.constant 22 : index | |
%c23 = arith.constant 23 : index | |
%c24 = arith.constant 24 : index | |
%c25 = arith.constant 25 : index | |
%c26 = arith.constant 26 : index | |
%c27 = arith.constant 27 : index | |
%c28 = arith.constant 28 : index | |
%c29 = arith.constant 29 : index | |
%c30 = arith.constant 30 : index | |
%c31 = arith.constant 31 : index | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c64 = arith.constant 64 : index | |
%c60 = arith.constant 60 : index | |
%c32 = arith.constant 32 : index | |
%c43 = arith.constant 43 : index | |
%cst_0 = arith.constant dense<0> : vector<32x1xi32> | |
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32> | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %0, 64 : memref<1x1080x1920x1xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%subview = memref.subview %1[0, %2, %3, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%4 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x] | |
%5 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x] | |
%6 = affine.min affine_map<()[s0] -> (1920, s0)>()[%5] | |
%7 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x] | |
%8 = affine.min affine_map<()[s0] -> (1920, s0)>()[%7] | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%8, %6] | |
%subview_1 = memref.subview %0[0, %2, %6, 0] [1, 60, %9, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>> | |
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>> | |
%10 = vector.extract %cst_0[0, 0] : vector<32x1xi32> | |
%11 = vector.extract %cst_0[1, 0] : vector<32x1xi32> | |
%12 = vector.extract %cst_0[2, 0] : vector<32x1xi32> | |
%13 = vector.extract %cst_0[3, 0] : vector<32x1xi32> | |
%14 = vector.extract %cst_0[4, 0] : vector<32x1xi32> | |
%15 = vector.extract %cst_0[5, 0] : vector<32x1xi32> | |
%16 = vector.extract %cst_0[6, 0] : vector<32x1xi32> | |
%17 = vector.extract %cst_0[7, 0] : vector<32x1xi32> | |
%18 = vector.extract %cst_0[8, 0] : vector<32x1xi32> | |
%19 = vector.extract %cst_0[9, 0] : vector<32x1xi32> | |
%20 = vector.extract %cst_0[10, 0] : vector<32x1xi32> | |
%21 = vector.extract %cst_0[11, 0] : vector<32x1xi32> | |
%22 = vector.extract %cst_0[12, 0] : vector<32x1xi32> | |
%23 = vector.extract %cst_0[13, 0] : vector<32x1xi32> | |
%24 = vector.extract %cst_0[14, 0] : vector<32x1xi32> | |
%25 = vector.extract %cst_0[15, 0] : vector<32x1xi32> | |
%26 = vector.extract %cst_0[16, 0] : vector<32x1xi32> | |
%27 = vector.extract %cst_0[17, 0] : vector<32x1xi32> | |
%28 = vector.extract %cst_0[18, 0] : vector<32x1xi32> | |
%29 = vector.extract %cst_0[19, 0] : vector<32x1xi32> | |
%30 = vector.extract %cst_0[20, 0] : vector<32x1xi32> | |
%31 = vector.extract %cst_0[21, 0] : vector<32x1xi32> | |
%32 = vector.extract %cst_0[22, 0] : vector<32x1xi32> | |
%33 = vector.extract %cst_0[23, 0] : vector<32x1xi32> | |
%34 = vector.extract %cst_0[24, 0] : vector<32x1xi32> | |
%35 = vector.extract %cst_0[25, 0] : vector<32x1xi32> | |
%36 = vector.extract %cst_0[26, 0] : vector<32x1xi32> | |
%37 = vector.extract %cst_0[27, 0] : vector<32x1xi32> | |
%38 = vector.extract %cst_0[28, 0] : vector<32x1xi32> | |
%39 = vector.extract %cst_0[29, 0] : vector<32x1xi32> | |
%40 = vector.extract %cst_0[30, 0] : vector<32x1xi32> | |
%41 = vector.extract %cst_0[31, 0] : vector<32x1xi32> | |
scf.for %arg0 = %c0 to %c60 step %c1 { | |
scf.for %arg1 = %c0 to %c64 step %c32 { | |
%42 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%4] | |
%43 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%4] | |
%44 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%43)[%8, %6] | |
%45 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%4] | |
%46 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%45)[%8, %6] | |
%47 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%46, %44) | |
%subview_3 = memref.subview %subview_1[0, %arg0, %44, 0] [1, 1, %47, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %10, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %11, %subview_4[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %12, %subview_4[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %13, %subview_4[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %14, %subview_4[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %15, %subview_4[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %16, %subview_4[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %17, %subview_4[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %18, %subview_4[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %19, %subview_4[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %20, %subview_4[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %21, %subview_4[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %22, %subview_4[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %23, %subview_4[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %24, %subview_4[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %25, %subview_4[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %26, %subview_4[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %27, %subview_4[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %28, %subview_4[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %29, %subview_4[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %30, %subview_4[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %31, %subview_4[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %32, %subview_4[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %33, %subview_4[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %34, %subview_4[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %35, %subview_4[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %36, %subview_4[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %37, %subview_4[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %38, %subview_4[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %39, %subview_4[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %40, %subview_4[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %41, %subview_4[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> | |
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>> | |
scf.for %arg2 = %c0 to %c43 step %c1 { | |
%48 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%42, %arg2) | |
%49 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %42) | |
%50 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%49, %46, %44) | |
%51 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %42) | |
%52 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%51, %46, %44) | |
%53 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%52, %50) | |
%subview_7 = memref.subview %subview_3[0, 0, %50, 0] [1, 1, %53, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %10, %alloca[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32> | |
memref.store %11, %alloca[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32> | |
memref.store %12, %alloca[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32> | |
memref.store %13, %alloca[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32> | |
memref.store %14, %alloca[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32> | |
memref.store %15, %alloca[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32> | |
memref.store %16, %alloca[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32> | |
memref.store %17, %alloca[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32> | |
memref.store %18, %alloca[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32> | |
memref.store %19, %alloca[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32> | |
memref.store %20, %alloca[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32> | |
memref.store %21, %alloca[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32> | |
memref.store %22, %alloca[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32> | |
memref.store %23, %alloca[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32> | |
memref.store %24, %alloca[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32> | |
memref.store %25, %alloca[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32> | |
memref.store %26, %alloca[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32> | |
memref.store %27, %alloca[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32> | |
memref.store %28, %alloca[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32> | |
memref.store %29, %alloca[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32> | |
memref.store %30, %alloca[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32> | |
memref.store %31, %alloca[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32> | |
memref.store %32, %alloca[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32> | |
memref.store %33, %alloca[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32> | |
memref.store %34, %alloca[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32> | |
memref.store %35, %alloca[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32> | |
memref.store %36, %alloca[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32> | |
memref.store %37, %alloca[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32> | |
memref.store %38, %alloca[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32> | |
memref.store %39, %alloca[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32> | |
memref.store %40, %alloca[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32> | |
memref.store %41, %alloca[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32> | |
%54 = arith.cmpi sgt, %53, %c0 : index | |
%55 = scf.if %54 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c0, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %cst_0 [0] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %cst_0 : vector<32x1xi32> | |
} | |
%56 = arith.cmpi sgt, %53, %c1 : index | |
%57 = scf.if %56 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c1, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %55 [1] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %55 : vector<32x1xi32> | |
} | |
%58 = arith.cmpi sgt, %53, %c2 : index | |
%59 = scf.if %58 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c2, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %57 [2] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %57 : vector<32x1xi32> | |
} | |
%60 = arith.cmpi sgt, %53, %c3 : index | |
%61 = scf.if %60 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c3, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %59 [3] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %59 : vector<32x1xi32> | |
} | |
%62 = arith.cmpi sgt, %53, %c4 : index | |
%63 = scf.if %62 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c4, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %61 [4] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %61 : vector<32x1xi32> | |
} | |
%64 = arith.cmpi sgt, %53, %c5 : index | |
%65 = scf.if %64 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c5, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %63 [5] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %63 : vector<32x1xi32> | |
} | |
%66 = arith.cmpi sgt, %53, %c6 : index | |
%67 = scf.if %66 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c6, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %65 [6] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %65 : vector<32x1xi32> | |
} | |
%68 = arith.cmpi sgt, %53, %c7 : index | |
%69 = scf.if %68 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c7, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %67 [7] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %67 : vector<32x1xi32> | |
} | |
%70 = arith.cmpi sgt, %53, %c8 : index | |
%71 = scf.if %70 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c8, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %69 [8] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %69 : vector<32x1xi32> | |
} | |
%72 = arith.cmpi sgt, %53, %c9 : index | |
%73 = scf.if %72 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c9, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %71 [9] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %71 : vector<32x1xi32> | |
} | |
%74 = arith.cmpi sgt, %53, %c10 : index | |
%75 = scf.if %74 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c10, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %73 [10] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %73 : vector<32x1xi32> | |
} | |
%76 = arith.cmpi sgt, %53, %c11 : index | |
%77 = scf.if %76 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c11, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %75 [11] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %75 : vector<32x1xi32> | |
} | |
%78 = arith.cmpi sgt, %53, %c12 : index | |
%79 = scf.if %78 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c12, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %77 [12] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %77 : vector<32x1xi32> | |
} | |
%80 = arith.cmpi sgt, %53, %c13 : index | |
%81 = scf.if %80 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c13, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %79 [13] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %79 : vector<32x1xi32> | |
} | |
%82 = arith.cmpi sgt, %53, %c14 : index | |
%83 = scf.if %82 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c14, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %81 [14] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %81 : vector<32x1xi32> | |
} | |
%84 = arith.cmpi sgt, %53, %c15 : index | |
%85 = scf.if %84 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c15, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %83 [15] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %83 : vector<32x1xi32> | |
} | |
%86 = arith.cmpi sgt, %53, %c16 : index | |
%87 = scf.if %86 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c16, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %85 [16] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %85 : vector<32x1xi32> | |
} | |
%88 = arith.cmpi sgt, %53, %c17 : index | |
%89 = scf.if %88 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c17, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %87 [17] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %87 : vector<32x1xi32> | |
} | |
%90 = arith.cmpi sgt, %53, %c18 : index | |
%91 = scf.if %90 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c18, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %89 [18] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %89 : vector<32x1xi32> | |
} | |
%92 = arith.cmpi sgt, %53, %c19 : index | |
%93 = scf.if %92 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c19, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %91 [19] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %91 : vector<32x1xi32> | |
} | |
%94 = arith.cmpi sgt, %53, %c20 : index | |
%95 = scf.if %94 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c20, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %93 [20] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %93 : vector<32x1xi32> | |
} | |
%96 = arith.cmpi sgt, %53, %c21 : index | |
%97 = scf.if %96 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c21, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %95 [21] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %95 : vector<32x1xi32> | |
} | |
%98 = arith.cmpi sgt, %53, %c22 : index | |
%99 = scf.if %98 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c22, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %97 [22] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %97 : vector<32x1xi32> | |
} | |
%100 = arith.cmpi sgt, %53, %c23 : index | |
%101 = scf.if %100 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c23, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %99 [23] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %99 : vector<32x1xi32> | |
} | |
%102 = arith.cmpi sgt, %53, %c24 : index | |
%103 = scf.if %102 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c24, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %101 [24] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %101 : vector<32x1xi32> | |
} | |
%104 = arith.cmpi sgt, %53, %c25 : index | |
%105 = scf.if %104 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c25, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %103 [25] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %103 : vector<32x1xi32> | |
} | |
%106 = arith.cmpi sgt, %53, %c26 : index | |
%107 = scf.if %106 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c26, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %105 [26] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %105 : vector<32x1xi32> | |
} | |
%108 = arith.cmpi sgt, %53, %c27 : index | |
%109 = scf.if %108 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c27, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %107 [27] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %107 : vector<32x1xi32> | |
} | |
%110 = arith.cmpi sgt, %53, %c28 : index | |
%111 = scf.if %110 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c28, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %109 [28] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %109 : vector<32x1xi32> | |
} | |
%112 = arith.cmpi sgt, %53, %c29 : index | |
%113 = scf.if %112 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c29, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %111 [29] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %111 : vector<32x1xi32> | |
} | |
%114 = arith.cmpi sgt, %53, %c30 : index | |
%115 = scf.if %114 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c30, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %113 [30] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %113 : vector<32x1xi32> | |
} | |
%116 = arith.cmpi sgt, %53, %c31 : index | |
%117 = scf.if %116 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c31, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %115 [31] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %115 : vector<32x1xi32> | |
} | |
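        // Annotation: the chain of arith.cmpi + scf.if ops above is a scalarized
        // masked read. Lane i of %117 holds %subview_7[0, 0, i, 0] when i < %53
        // (the valid width of this window) and keeps the zero splat otherwise;
        // the guarded stores that follow scatter those lanes into the staging
        // buffer at offset %48.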
%118 = arith.cmpi slt, %48, %c32 : index | |
scf.if %118 { | |
%187 = vector.extract %117[0, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %48, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%119 = affine.apply affine_map<(d0) -> (d0 + 1)>(%48) | |
%120 = arith.cmpi slt, %119, %c32 : index | |
scf.if %120 { | |
%187 = vector.extract %117[1, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %119, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%121 = affine.apply affine_map<(d0) -> (d0 + 2)>(%48) | |
%122 = arith.cmpi slt, %121, %c32 : index | |
scf.if %122 { | |
%187 = vector.extract %117[2, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %121, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%123 = affine.apply affine_map<(d0) -> (d0 + 3)>(%48) | |
%124 = arith.cmpi slt, %123, %c32 : index | |
scf.if %124 { | |
%187 = vector.extract %117[3, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %123, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%125 = affine.apply affine_map<(d0) -> (d0 + 4)>(%48) | |
%126 = arith.cmpi slt, %125, %c32 : index | |
scf.if %126 { | |
%187 = vector.extract %117[4, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %125, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%127 = affine.apply affine_map<(d0) -> (d0 + 5)>(%48) | |
%128 = arith.cmpi slt, %127, %c32 : index | |
scf.if %128 { | |
%187 = vector.extract %117[5, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %127, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%129 = affine.apply affine_map<(d0) -> (d0 + 6)>(%48) | |
%130 = arith.cmpi slt, %129, %c32 : index | |
scf.if %130 { | |
%187 = vector.extract %117[6, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %129, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%131 = affine.apply affine_map<(d0) -> (d0 + 7)>(%48) | |
%132 = arith.cmpi slt, %131, %c32 : index | |
scf.if %132 { | |
%187 = vector.extract %117[7, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %131, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%133 = affine.apply affine_map<(d0) -> (d0 + 8)>(%48) | |
%134 = arith.cmpi slt, %133, %c32 : index | |
scf.if %134 { | |
%187 = vector.extract %117[8, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %133, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%135 = affine.apply affine_map<(d0) -> (d0 + 9)>(%48) | |
%136 = arith.cmpi slt, %135, %c32 : index | |
scf.if %136 { | |
%187 = vector.extract %117[9, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %135, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%137 = affine.apply affine_map<(d0) -> (d0 + 10)>(%48) | |
%138 = arith.cmpi slt, %137, %c32 : index | |
scf.if %138 { | |
%187 = vector.extract %117[10, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %137, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%139 = affine.apply affine_map<(d0) -> (d0 + 11)>(%48) | |
%140 = arith.cmpi slt, %139, %c32 : index | |
scf.if %140 { | |
%187 = vector.extract %117[11, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %139, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%141 = affine.apply affine_map<(d0) -> (d0 + 12)>(%48) | |
%142 = arith.cmpi slt, %141, %c32 : index | |
scf.if %142 { | |
%187 = vector.extract %117[12, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %141, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%143 = affine.apply affine_map<(d0) -> (d0 + 13)>(%48) | |
%144 = arith.cmpi slt, %143, %c32 : index | |
scf.if %144 { | |
%187 = vector.extract %117[13, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %143, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%145 = affine.apply affine_map<(d0) -> (d0 + 14)>(%48) | |
%146 = arith.cmpi slt, %145, %c32 : index | |
scf.if %146 { | |
%187 = vector.extract %117[14, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %145, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%147 = affine.apply affine_map<(d0) -> (d0 + 15)>(%48) | |
%148 = arith.cmpi slt, %147, %c32 : index | |
scf.if %148 { | |
%187 = vector.extract %117[15, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %147, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%149 = affine.apply affine_map<(d0) -> (d0 + 16)>(%48) | |
%150 = arith.cmpi slt, %149, %c32 : index | |
scf.if %150 { | |
%187 = vector.extract %117[16, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %149, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%151 = affine.apply affine_map<(d0) -> (d0 + 17)>(%48) | |
%152 = arith.cmpi slt, %151, %c32 : index | |
scf.if %152 { | |
%187 = vector.extract %117[17, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %151, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%153 = affine.apply affine_map<(d0) -> (d0 + 18)>(%48) | |
%154 = arith.cmpi slt, %153, %c32 : index | |
scf.if %154 { | |
%187 = vector.extract %117[18, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %153, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%155 = affine.apply affine_map<(d0) -> (d0 + 19)>(%48) | |
%156 = arith.cmpi slt, %155, %c32 : index | |
scf.if %156 { | |
%187 = vector.extract %117[19, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %155, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%157 = affine.apply affine_map<(d0) -> (d0 + 20)>(%48) | |
%158 = arith.cmpi slt, %157, %c32 : index | |
scf.if %158 { | |
%187 = vector.extract %117[20, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %157, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%159 = affine.apply affine_map<(d0) -> (d0 + 21)>(%48) | |
%160 = arith.cmpi slt, %159, %c32 : index | |
scf.if %160 { | |
%187 = vector.extract %117[21, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %159, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%161 = affine.apply affine_map<(d0) -> (d0 + 22)>(%48) | |
%162 = arith.cmpi slt, %161, %c32 : index | |
scf.if %162 { | |
%187 = vector.extract %117[22, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %161, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%163 = affine.apply affine_map<(d0) -> (d0 + 23)>(%48) | |
%164 = arith.cmpi slt, %163, %c32 : index | |
scf.if %164 { | |
%187 = vector.extract %117[23, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %163, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%165 = affine.apply affine_map<(d0) -> (d0 + 24)>(%48) | |
%166 = arith.cmpi slt, %165, %c32 : index | |
scf.if %166 { | |
%187 = vector.extract %117[24, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %165, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%167 = affine.apply affine_map<(d0) -> (d0 + 25)>(%48) | |
%168 = arith.cmpi slt, %167, %c32 : index | |
scf.if %168 { | |
%187 = vector.extract %117[25, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %167, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%169 = affine.apply affine_map<(d0) -> (d0 + 26)>(%48) | |
%170 = arith.cmpi slt, %169, %c32 : index | |
scf.if %170 { | |
%187 = vector.extract %117[26, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %169, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%171 = affine.apply affine_map<(d0) -> (d0 + 27)>(%48) | |
%172 = arith.cmpi slt, %171, %c32 : index | |
scf.if %172 { | |
%187 = vector.extract %117[27, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %171, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%173 = affine.apply affine_map<(d0) -> (d0 + 28)>(%48) | |
%174 = arith.cmpi slt, %173, %c32 : index | |
scf.if %174 { | |
%187 = vector.extract %117[28, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %173, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%175 = affine.apply affine_map<(d0) -> (d0 + 29)>(%48) | |
%176 = arith.cmpi slt, %175, %c32 : index | |
scf.if %176 { | |
%187 = vector.extract %117[29, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %175, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%177 = affine.apply affine_map<(d0) -> (d0 + 30)>(%48) | |
%178 = arith.cmpi slt, %177, %c32 : index | |
scf.if %178 { | |
%187 = vector.extract %117[30, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %177, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%179 = affine.apply affine_map<(d0) -> (d0 + 31)>(%48) | |
%180 = arith.cmpi slt, %179, %c32 : index | |
scf.if %180 { | |
%187 = vector.extract %117[31, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %179, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
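        // Annotation: one tap of the depthwise convolution, fully vectorized.
        // %181 is the zero-padded input tile staged in %alloca, %183 the running
        // output accumulator, and %184 a 32-lane splat of filter weight
        // %cst[0, %arg2, 0]; %186 = %181 * %184 + %183 is stored back to the
        // output view.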
%181 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32> | |
%extracted = tensor.extract %cst[%c0, %arg2, %c0] : tensor<1x43x1xi32> | |
%182 = vector.broadcast %extracted : i32 to vector<1xi32> | |
%183 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
%184 = vector.shuffle %182, %182 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%185 = arith.muli %181, %184 : vector<32xi32> | |
%186 = arith.addi %185, %183 : vector<32xi32> | |
vector.store %186, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (iree-linalg-strategy-remove-markers-pass) //----- // | |
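// Annotation: this pass only strips the internal transformation-marker
// attributes that drove the tiling/vectorization strategy; the function body
// below is otherwise identical to the previous dump.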
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c2 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%c4 = arith.constant 4 : index | |
%c5 = arith.constant 5 : index | |
%c6 = arith.constant 6 : index | |
%c7 = arith.constant 7 : index | |
%c8 = arith.constant 8 : index | |
%c9 = arith.constant 9 : index | |
%c10 = arith.constant 10 : index | |
%c11 = arith.constant 11 : index | |
%c12 = arith.constant 12 : index | |
%c13 = arith.constant 13 : index | |
%c14 = arith.constant 14 : index | |
%c15 = arith.constant 15 : index | |
%c16 = arith.constant 16 : index | |
%c17 = arith.constant 17 : index | |
%c18 = arith.constant 18 : index | |
%c19 = arith.constant 19 : index | |
%c20 = arith.constant 20 : index | |
%c21 = arith.constant 21 : index | |
%c22 = arith.constant 22 : index | |
%c23 = arith.constant 23 : index | |
%c24 = arith.constant 24 : index | |
%c25 = arith.constant 25 : index | |
%c26 = arith.constant 26 : index | |
%c27 = arith.constant 27 : index | |
%c28 = arith.constant 28 : index | |
%c29 = arith.constant 29 : index | |
%c30 = arith.constant 30 : index | |
%c31 = arith.constant 31 : index | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c64 = arith.constant 64 : index | |
%c60 = arith.constant 60 : index | |
%c32 = arith.constant 32 : index | |
%c43 = arith.constant 43 : index | |
%cst_0 = arith.constant dense<0> : vector<32x1xi32> | |
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32> | |
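  // Annotation: %alloca is a 64-byte-aligned 1x1x32x1 stack tile; it stages the
  // zero-padded input window so the inner loop can consume it with a single
  // 32-lane vector.load.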
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %0, 64 : memref<1x1080x1920x1xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%subview = memref.subview %1[0, %2, %3, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%4 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x] | |
%5 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x] | |
%6 = affine.min affine_map<()[s0] -> (1920, s0)>()[%5] | |
%7 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x] | |
%8 = affine.min affine_map<()[s0] -> (1920, s0)>()[%7] | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%8, %6] | |
%subview_1 = memref.subview %0[0, %2, %6, 0] [1, 60, %9, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>> | |
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>> | |
%10 = vector.extract %cst_0[0, 0] : vector<32x1xi32> | |
%11 = vector.extract %cst_0[1, 0] : vector<32x1xi32> | |
%12 = vector.extract %cst_0[2, 0] : vector<32x1xi32> | |
%13 = vector.extract %cst_0[3, 0] : vector<32x1xi32> | |
%14 = vector.extract %cst_0[4, 0] : vector<32x1xi32> | |
%15 = vector.extract %cst_0[5, 0] : vector<32x1xi32> | |
%16 = vector.extract %cst_0[6, 0] : vector<32x1xi32> | |
%17 = vector.extract %cst_0[7, 0] : vector<32x1xi32> | |
%18 = vector.extract %cst_0[8, 0] : vector<32x1xi32> | |
%19 = vector.extract %cst_0[9, 0] : vector<32x1xi32> | |
%20 = vector.extract %cst_0[10, 0] : vector<32x1xi32> | |
%21 = vector.extract %cst_0[11, 0] : vector<32x1xi32> | |
%22 = vector.extract %cst_0[12, 0] : vector<32x1xi32> | |
%23 = vector.extract %cst_0[13, 0] : vector<32x1xi32> | |
%24 = vector.extract %cst_0[14, 0] : vector<32x1xi32> | |
%25 = vector.extract %cst_0[15, 0] : vector<32x1xi32> | |
%26 = vector.extract %cst_0[16, 0] : vector<32x1xi32> | |
%27 = vector.extract %cst_0[17, 0] : vector<32x1xi32> | |
%28 = vector.extract %cst_0[18, 0] : vector<32x1xi32> | |
%29 = vector.extract %cst_0[19, 0] : vector<32x1xi32> | |
%30 = vector.extract %cst_0[20, 0] : vector<32x1xi32> | |
%31 = vector.extract %cst_0[21, 0] : vector<32x1xi32> | |
%32 = vector.extract %cst_0[22, 0] : vector<32x1xi32> | |
%33 = vector.extract %cst_0[23, 0] : vector<32x1xi32> | |
%34 = vector.extract %cst_0[24, 0] : vector<32x1xi32> | |
%35 = vector.extract %cst_0[25, 0] : vector<32x1xi32> | |
%36 = vector.extract %cst_0[26, 0] : vector<32x1xi32> | |
%37 = vector.extract %cst_0[27, 0] : vector<32x1xi32> | |
%38 = vector.extract %cst_0[28, 0] : vector<32x1xi32> | |
%39 = vector.extract %cst_0[29, 0] : vector<32x1xi32> | |
%40 = vector.extract %cst_0[30, 0] : vector<32x1xi32> | |
%41 = vector.extract %cst_0[31, 0] : vector<32x1xi32> | |
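  // Annotation: %10 through %41 each extract one lane of the zero splat %cst_0,
  // so all of them are the scalar constant 0; they feed the scalarized
  // zero-fills of the output tile and of the staging buffer below.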
scf.for %arg0 = %c0 to %c60 step %c1 { | |
scf.for %arg1 = %c0 to %c64 step %c32 { | |
%42 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%4] | |
%43 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%4] | |
%44 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%43)[%8, %6] | |
%45 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%4] | |
%46 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%45)[%8, %6] | |
%47 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%46, %44) | |
%subview_3 = memref.subview %subview_1[0, %arg0, %44, 0] [1, 1, %47, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
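      // Annotation: %subview_3 is the valid (unpadded) input slice for this row
      // and column tile; %subview_4 is the 1x1x32x1 output tile. The 32 scalar
      // stores below are the scalarized fill that zero-initializes the
      // accumulator.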
memref.store %10, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %11, %subview_4[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %12, %subview_4[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %13, %subview_4[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %14, %subview_4[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %15, %subview_4[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %16, %subview_4[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %17, %subview_4[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %18, %subview_4[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %19, %subview_4[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %20, %subview_4[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %21, %subview_4[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %22, %subview_4[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %23, %subview_4[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %24, %subview_4[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %25, %subview_4[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %26, %subview_4[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %27, %subview_4[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %28, %subview_4[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %29, %subview_4[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %30, %subview_4[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %31, %subview_4[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %32, %subview_4[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %33, %subview_4[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %34, %subview_4[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %35, %subview_4[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %36, %subview_4[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %37, %subview_4[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %38, %subview_4[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %39, %subview_4[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %40, %subview_4[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %41, %subview_4[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> | |
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>> | |
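      // Annotation: inner reduction over the 43 taps of the 1x43x1 depthwise
      // filter; each iteration accumulates one tap into the 32-wide output tile.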
scf.for %arg2 = %c0 to %c43 step %c1 { | |
%48 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%42, %arg2) | |
%49 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %42) | |
%50 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%49, %46, %44) | |
%51 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %42) | |
%52 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%51, %46, %44) | |
%53 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%52, %50) | |
%subview_7 = memref.subview %subview_3[0, 0, %50, 0] [1, 1, %53, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
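        // Annotation: per-tap window arithmetic. %48 is the number of leading
        // zero-padding lanes for tap %arg2, %50 and %52 clamp the read window to
        // the valid input extent, and %53 = %52 - %50 is the count of real
        // elements to load.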
memref.store %10, %alloca[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32> | |
memref.store %11, %alloca[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32> | |
memref.store %12, %alloca[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32> | |
memref.store %13, %alloca[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32> | |
memref.store %14, %alloca[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32> | |
memref.store %15, %alloca[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32> | |
memref.store %16, %alloca[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32> | |
memref.store %17, %alloca[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32> | |
memref.store %18, %alloca[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32> | |
memref.store %19, %alloca[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32> | |
memref.store %20, %alloca[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32> | |
memref.store %21, %alloca[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32> | |
memref.store %22, %alloca[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32> | |
memref.store %23, %alloca[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32> | |
memref.store %24, %alloca[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32> | |
memref.store %25, %alloca[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32> | |
memref.store %26, %alloca[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32> | |
memref.store %27, %alloca[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32> | |
memref.store %28, %alloca[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32> | |
memref.store %29, %alloca[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32> | |
memref.store %30, %alloca[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32> | |
memref.store %31, %alloca[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32> | |
memref.store %32, %alloca[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32> | |
memref.store %33, %alloca[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32> | |
memref.store %34, %alloca[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32> | |
memref.store %35, %alloca[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32> | |
memref.store %36, %alloca[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32> | |
memref.store %37, %alloca[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32> | |
memref.store %38, %alloca[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32> | |
memref.store %39, %alloca[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32> | |
memref.store %40, %alloca[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32> | |
memref.store %41, %alloca[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32> | |
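        // Annotation: the staging tile is reset to zero before each tap; the
        // guarded gather and scatter below then repopulate only its valid lanes.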
%54 = arith.cmpi sgt, %53, %c0 : index | |
%55 = scf.if %54 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c0, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %cst_0 [0] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %cst_0 : vector<32x1xi32> | |
} | |
%56 = arith.cmpi sgt, %53, %c1 : index | |
%57 = scf.if %56 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c1, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %55 [1] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %55 : vector<32x1xi32> | |
} | |
%58 = arith.cmpi sgt, %53, %c2 : index | |
%59 = scf.if %58 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c2, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %57 [2] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %57 : vector<32x1xi32> | |
} | |
%60 = arith.cmpi sgt, %53, %c3 : index | |
%61 = scf.if %60 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c3, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %59 [3] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %59 : vector<32x1xi32> | |
} | |
%62 = arith.cmpi sgt, %53, %c4 : index | |
%63 = scf.if %62 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c4, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %61 [4] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %61 : vector<32x1xi32> | |
} | |
%64 = arith.cmpi sgt, %53, %c5 : index | |
%65 = scf.if %64 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c5, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %63 [5] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %63 : vector<32x1xi32> | |
} | |
%66 = arith.cmpi sgt, %53, %c6 : index | |
%67 = scf.if %66 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c6, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %65 [6] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %65 : vector<32x1xi32> | |
} | |
%68 = arith.cmpi sgt, %53, %c7 : index | |
%69 = scf.if %68 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c7, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %67 [7] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %67 : vector<32x1xi32> | |
} | |
%70 = arith.cmpi sgt, %53, %c8 : index | |
%71 = scf.if %70 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c8, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %69 [8] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %69 : vector<32x1xi32> | |
} | |
%72 = arith.cmpi sgt, %53, %c9 : index | |
%73 = scf.if %72 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c9, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %71 [9] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %71 : vector<32x1xi32> | |
} | |
%74 = arith.cmpi sgt, %53, %c10 : index | |
%75 = scf.if %74 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c10, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %73 [10] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %73 : vector<32x1xi32> | |
} | |
%76 = arith.cmpi sgt, %53, %c11 : index | |
%77 = scf.if %76 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c11, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %75 [11] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %75 : vector<32x1xi32> | |
} | |
%78 = arith.cmpi sgt, %53, %c12 : index | |
%79 = scf.if %78 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c12, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %77 [12] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %77 : vector<32x1xi32> | |
} | |
%80 = arith.cmpi sgt, %53, %c13 : index | |
%81 = scf.if %80 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c13, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %79 [13] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %79 : vector<32x1xi32> | |
} | |
%82 = arith.cmpi sgt, %53, %c14 : index | |
%83 = scf.if %82 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c14, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %81 [14] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %81 : vector<32x1xi32> | |
} | |
%84 = arith.cmpi sgt, %53, %c15 : index | |
%85 = scf.if %84 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c15, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %83 [15] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %83 : vector<32x1xi32> | |
} | |
%86 = arith.cmpi sgt, %53, %c16 : index | |
%87 = scf.if %86 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c16, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %85 [16] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %85 : vector<32x1xi32> | |
} | |
%88 = arith.cmpi sgt, %53, %c17 : index | |
%89 = scf.if %88 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c17, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %87 [17] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %87 : vector<32x1xi32> | |
} | |
%90 = arith.cmpi sgt, %53, %c18 : index | |
%91 = scf.if %90 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c18, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %89 [18] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %89 : vector<32x1xi32> | |
} | |
%92 = arith.cmpi sgt, %53, %c19 : index | |
%93 = scf.if %92 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c19, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %91 [19] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %91 : vector<32x1xi32> | |
} | |
%94 = arith.cmpi sgt, %53, %c20 : index | |
%95 = scf.if %94 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c20, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %93 [20] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %93 : vector<32x1xi32> | |
} | |
%96 = arith.cmpi sgt, %53, %c21 : index | |
%97 = scf.if %96 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c21, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %95 [21] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %95 : vector<32x1xi32> | |
} | |
%98 = arith.cmpi sgt, %53, %c22 : index | |
%99 = scf.if %98 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c22, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %97 [22] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %97 : vector<32x1xi32> | |
} | |
%100 = arith.cmpi sgt, %53, %c23 : index | |
%101 = scf.if %100 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c23, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %99 [23] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %99 : vector<32x1xi32> | |
} | |
%102 = arith.cmpi sgt, %53, %c24 : index | |
%103 = scf.if %102 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c24, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %101 [24] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %101 : vector<32x1xi32> | |
} | |
%104 = arith.cmpi sgt, %53, %c25 : index | |
%105 = scf.if %104 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c25, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %103 [25] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %103 : vector<32x1xi32> | |
} | |
%106 = arith.cmpi sgt, %53, %c26 : index | |
%107 = scf.if %106 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c26, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %105 [26] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %105 : vector<32x1xi32> | |
} | |
%108 = arith.cmpi sgt, %53, %c27 : index | |
%109 = scf.if %108 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c27, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %107 [27] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %107 : vector<32x1xi32> | |
} | |
%110 = arith.cmpi sgt, %53, %c28 : index | |
%111 = scf.if %110 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c28, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %109 [28] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %109 : vector<32x1xi32> | |
} | |
%112 = arith.cmpi sgt, %53, %c29 : index | |
%113 = scf.if %112 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c29, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %111 [29] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %111 : vector<32x1xi32> | |
} | |
%114 = arith.cmpi sgt, %53, %c30 : index | |
%115 = scf.if %114 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c30, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %113 [30] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %113 : vector<32x1xi32> | |
} | |
%116 = arith.cmpi sgt, %53, %c31 : index | |
%117 = scf.if %116 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c31, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %115 [31] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %115 : vector<32x1xi32> | |
} | |
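        // Annotation: guarded scatter. Lane i of %117 is written to %alloca at
        // position %48 + i whenever that position is still inside the 32-element
        // tile; together with the zero reset above this materializes the padding.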
%118 = arith.cmpi slt, %48, %c32 : index | |
scf.if %118 { | |
%187 = vector.extract %117[0, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %48, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%119 = affine.apply affine_map<(d0) -> (d0 + 1)>(%48) | |
%120 = arith.cmpi slt, %119, %c32 : index | |
scf.if %120 { | |
%187 = vector.extract %117[1, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %119, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%121 = affine.apply affine_map<(d0) -> (d0 + 2)>(%48) | |
%122 = arith.cmpi slt, %121, %c32 : index | |
scf.if %122 { | |
%187 = vector.extract %117[2, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %121, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%123 = affine.apply affine_map<(d0) -> (d0 + 3)>(%48) | |
%124 = arith.cmpi slt, %123, %c32 : index | |
scf.if %124 { | |
%187 = vector.extract %117[3, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %123, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%125 = affine.apply affine_map<(d0) -> (d0 + 4)>(%48) | |
%126 = arith.cmpi slt, %125, %c32 : index | |
scf.if %126 { | |
%187 = vector.extract %117[4, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %125, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%127 = affine.apply affine_map<(d0) -> (d0 + 5)>(%48) | |
%128 = arith.cmpi slt, %127, %c32 : index | |
scf.if %128 { | |
%187 = vector.extract %117[5, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %127, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%129 = affine.apply affine_map<(d0) -> (d0 + 6)>(%48) | |
%130 = arith.cmpi slt, %129, %c32 : index | |
scf.if %130 { | |
%187 = vector.extract %117[6, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %129, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%131 = affine.apply affine_map<(d0) -> (d0 + 7)>(%48) | |
%132 = arith.cmpi slt, %131, %c32 : index | |
scf.if %132 { | |
%187 = vector.extract %117[7, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %131, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%133 = affine.apply affine_map<(d0) -> (d0 + 8)>(%48) | |
%134 = arith.cmpi slt, %133, %c32 : index | |
scf.if %134 { | |
%187 = vector.extract %117[8, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %133, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%135 = affine.apply affine_map<(d0) -> (d0 + 9)>(%48) | |
%136 = arith.cmpi slt, %135, %c32 : index | |
scf.if %136 { | |
%187 = vector.extract %117[9, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %135, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%137 = affine.apply affine_map<(d0) -> (d0 + 10)>(%48) | |
%138 = arith.cmpi slt, %137, %c32 : index | |
scf.if %138 { | |
%187 = vector.extract %117[10, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %137, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%139 = affine.apply affine_map<(d0) -> (d0 + 11)>(%48) | |
%140 = arith.cmpi slt, %139, %c32 : index | |
scf.if %140 { | |
%187 = vector.extract %117[11, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %139, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%141 = affine.apply affine_map<(d0) -> (d0 + 12)>(%48) | |
%142 = arith.cmpi slt, %141, %c32 : index | |
scf.if %142 { | |
%187 = vector.extract %117[12, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %141, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%143 = affine.apply affine_map<(d0) -> (d0 + 13)>(%48) | |
%144 = arith.cmpi slt, %143, %c32 : index | |
scf.if %144 { | |
%187 = vector.extract %117[13, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %143, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%145 = affine.apply affine_map<(d0) -> (d0 + 14)>(%48) | |
%146 = arith.cmpi slt, %145, %c32 : index | |
scf.if %146 { | |
%187 = vector.extract %117[14, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %145, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%147 = affine.apply affine_map<(d0) -> (d0 + 15)>(%48) | |
%148 = arith.cmpi slt, %147, %c32 : index | |
scf.if %148 { | |
%187 = vector.extract %117[15, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %147, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%149 = affine.apply affine_map<(d0) -> (d0 + 16)>(%48) | |
%150 = arith.cmpi slt, %149, %c32 : index | |
scf.if %150 { | |
%187 = vector.extract %117[16, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %149, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%151 = affine.apply affine_map<(d0) -> (d0 + 17)>(%48) | |
%152 = arith.cmpi slt, %151, %c32 : index | |
scf.if %152 { | |
%187 = vector.extract %117[17, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %151, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%153 = affine.apply affine_map<(d0) -> (d0 + 18)>(%48) | |
%154 = arith.cmpi slt, %153, %c32 : index | |
scf.if %154 { | |
%187 = vector.extract %117[18, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %153, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%155 = affine.apply affine_map<(d0) -> (d0 + 19)>(%48) | |
%156 = arith.cmpi slt, %155, %c32 : index | |
scf.if %156 { | |
%187 = vector.extract %117[19, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %155, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%157 = affine.apply affine_map<(d0) -> (d0 + 20)>(%48) | |
%158 = arith.cmpi slt, %157, %c32 : index | |
scf.if %158 { | |
%187 = vector.extract %117[20, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %157, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%159 = affine.apply affine_map<(d0) -> (d0 + 21)>(%48) | |
%160 = arith.cmpi slt, %159, %c32 : index | |
scf.if %160 { | |
%187 = vector.extract %117[21, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %159, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%161 = affine.apply affine_map<(d0) -> (d0 + 22)>(%48) | |
%162 = arith.cmpi slt, %161, %c32 : index | |
scf.if %162 { | |
%187 = vector.extract %117[22, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %161, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%163 = affine.apply affine_map<(d0) -> (d0 + 23)>(%48) | |
%164 = arith.cmpi slt, %163, %c32 : index | |
scf.if %164 { | |
%187 = vector.extract %117[23, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %163, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%165 = affine.apply affine_map<(d0) -> (d0 + 24)>(%48) | |
%166 = arith.cmpi slt, %165, %c32 : index | |
scf.if %166 { | |
%187 = vector.extract %117[24, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %165, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%167 = affine.apply affine_map<(d0) -> (d0 + 25)>(%48) | |
%168 = arith.cmpi slt, %167, %c32 : index | |
scf.if %168 { | |
%187 = vector.extract %117[25, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %167, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%169 = affine.apply affine_map<(d0) -> (d0 + 26)>(%48) | |
%170 = arith.cmpi slt, %169, %c32 : index | |
scf.if %170 { | |
%187 = vector.extract %117[26, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %169, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%171 = affine.apply affine_map<(d0) -> (d0 + 27)>(%48) | |
%172 = arith.cmpi slt, %171, %c32 : index | |
scf.if %172 { | |
%187 = vector.extract %117[27, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %171, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%173 = affine.apply affine_map<(d0) -> (d0 + 28)>(%48) | |
%174 = arith.cmpi slt, %173, %c32 : index | |
scf.if %174 { | |
%187 = vector.extract %117[28, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %173, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%175 = affine.apply affine_map<(d0) -> (d0 + 29)>(%48) | |
%176 = arith.cmpi slt, %175, %c32 : index | |
scf.if %176 { | |
%187 = vector.extract %117[29, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %175, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%177 = affine.apply affine_map<(d0) -> (d0 + 30)>(%48) | |
%178 = arith.cmpi slt, %177, %c32 : index | |
scf.if %178 { | |
%187 = vector.extract %117[30, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %177, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%179 = affine.apply affine_map<(d0) -> (d0 + 31)>(%48) | |
%180 = arith.cmpi slt, %179, %c32 : index | |
scf.if %180 { | |
%187 = vector.extract %117[31, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %179, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
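        // Annotation: tap %arg2 of the convolution, as in the previous dump:
        // accumulator(32 lanes) += padded_input(32 lanes) * splat(%cst[0, %arg2, 0]).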
%181 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32> | |
%extracted = tensor.extract %cst[%c0, %arg2, %c0] : tensor<1x43x1xi32> | |
%182 = vector.broadcast %extracted : i32 to vector<1xi32> | |
%183 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
%184 = vector.shuffle %182, %182 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%185 = arith.muli %181, %184 : vector<32xi32> | |
%186 = arith.addi %185, %183 : vector<32xi32> | |
vector.store %186, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
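// Annotation: CSE deduplicates structurally identical, side-effect-free
// operations. Most candidates here (the per-lane vector.extract and
// affine.apply ops) are already distinct, so the dump that follows appears to
// change little if at all.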
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c2 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%c4 = arith.constant 4 : index | |
%c5 = arith.constant 5 : index | |
%c6 = arith.constant 6 : index | |
%c7 = arith.constant 7 : index | |
%c8 = arith.constant 8 : index | |
%c9 = arith.constant 9 : index | |
%c10 = arith.constant 10 : index | |
%c11 = arith.constant 11 : index | |
%c12 = arith.constant 12 : index | |
%c13 = arith.constant 13 : index | |
%c14 = arith.constant 14 : index | |
%c15 = arith.constant 15 : index | |
%c16 = arith.constant 16 : index | |
%c17 = arith.constant 17 : index | |
%c18 = arith.constant 18 : index | |
%c19 = arith.constant 19 : index | |
%c20 = arith.constant 20 : index | |
%c21 = arith.constant 21 : index | |
%c22 = arith.constant 22 : index | |
%c23 = arith.constant 23 : index | |
%c24 = arith.constant 24 : index | |
%c25 = arith.constant 25 : index | |
%c26 = arith.constant 26 : index | |
%c27 = arith.constant 27 : index | |
%c28 = arith.constant 28 : index | |
%c29 = arith.constant 29 : index | |
%c30 = arith.constant 30 : index | |
%c31 = arith.constant 31 : index | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c64 = arith.constant 64 : index | |
%c60 = arith.constant 60 : index | |
%c32 = arith.constant 32 : index | |
%c43 = arith.constant 43 : index | |
%cst_0 = arith.constant dense<0> : vector<32x1xi32> | |
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32> | |
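    // %alloca is a 64-byte-aligned stack scratch buffer holding the (possibly
    // zero-padded) 32-element input window for the current filter tap.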
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %0, 64 : memref<1x1080x1920x1xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32> | |
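    // Binding 0 is the read-only 1080x1920 i32 source plane; binding 1 is the
    // destination plane of the same shape.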
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%subview = memref.subview %1[0, %2, %3, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%4 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x] | |
%5 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x] | |
%6 = affine.min affine_map<()[s0] -> (1920, s0)>()[%5] | |
%7 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x] | |
%8 = affine.min affine_map<()[s0] -> (1920, s0)>()[%7] | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%8, %6] | |
%subview_1 = memref.subview %0[0, %2, %6, 0] [1, 60, %9, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
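    // %subview is this workgroup's 60x64 output tile; %subview_1 is the
    // matching input tile widened by a halo of up to 21 columns on each side
    // (clamped at the image border), which is what a centered 43-tap filter
    // requires.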
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>> | |
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>> | |
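    // The subview + collapse_shape pair moves no data: it merely reinterprets
    // the 1x1x32x1 pad buffer as a flat 1x32 memref so the window can later be
    // read back with a single 1-D vector.load.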
%10 = vector.extract %cst_0[0, 0] : vector<32x1xi32> | |
%11 = vector.extract %cst_0[1, 0] : vector<32x1xi32> | |
%12 = vector.extract %cst_0[2, 0] : vector<32x1xi32> | |
%13 = vector.extract %cst_0[3, 0] : vector<32x1xi32> | |
%14 = vector.extract %cst_0[4, 0] : vector<32x1xi32> | |
%15 = vector.extract %cst_0[5, 0] : vector<32x1xi32> | |
%16 = vector.extract %cst_0[6, 0] : vector<32x1xi32> | |
%17 = vector.extract %cst_0[7, 0] : vector<32x1xi32> | |
%18 = vector.extract %cst_0[8, 0] : vector<32x1xi32> | |
%19 = vector.extract %cst_0[9, 0] : vector<32x1xi32> | |
%20 = vector.extract %cst_0[10, 0] : vector<32x1xi32> | |
%21 = vector.extract %cst_0[11, 0] : vector<32x1xi32> | |
%22 = vector.extract %cst_0[12, 0] : vector<32x1xi32> | |
%23 = vector.extract %cst_0[13, 0] : vector<32x1xi32> | |
%24 = vector.extract %cst_0[14, 0] : vector<32x1xi32> | |
%25 = vector.extract %cst_0[15, 0] : vector<32x1xi32> | |
%26 = vector.extract %cst_0[16, 0] : vector<32x1xi32> | |
%27 = vector.extract %cst_0[17, 0] : vector<32x1xi32> | |
%28 = vector.extract %cst_0[18, 0] : vector<32x1xi32> | |
%29 = vector.extract %cst_0[19, 0] : vector<32x1xi32> | |
%30 = vector.extract %cst_0[20, 0] : vector<32x1xi32> | |
%31 = vector.extract %cst_0[21, 0] : vector<32x1xi32> | |
%32 = vector.extract %cst_0[22, 0] : vector<32x1xi32> | |
%33 = vector.extract %cst_0[23, 0] : vector<32x1xi32> | |
%34 = vector.extract %cst_0[24, 0] : vector<32x1xi32> | |
%35 = vector.extract %cst_0[25, 0] : vector<32x1xi32> | |
%36 = vector.extract %cst_0[26, 0] : vector<32x1xi32> | |
%37 = vector.extract %cst_0[27, 0] : vector<32x1xi32> | |
%38 = vector.extract %cst_0[28, 0] : vector<32x1xi32> | |
%39 = vector.extract %cst_0[29, 0] : vector<32x1xi32> | |
%40 = vector.extract %cst_0[30, 0] : vector<32x1xi32> | |
%41 = vector.extract %cst_0[31, 0] : vector<32x1xi32> | |
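    // %10..%41 are the individual lanes of the zero splat %cst_0, i.e. the i32
    // constant 0, so the unrolled memref.store chains below amount to plain
    // zero-fills of the output tile and of the pad buffer.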
scf.for %arg0 = %c0 to %c60 step %c1 { | |
scf.for %arg1 = %c0 to %c64 step %c32 { | |
%42 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%4] | |
%43 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%4] | |
%44 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%43)[%8, %6] | |
%45 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%4] | |
%46 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%45)[%8, %6] | |
%47 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%46, %44) | |
%subview_3 = memref.subview %subview_1[0, %arg0, %44, 0] [1, 1, %47, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
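        // %42 reads as the left-padding requirement of this x-tile and
        // [%44, %46) as the clamped in-bounds slice of the input row;
        // %subview_3 and %subview_4 are the input and output windows for this
        // (row, x-tile) pair.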
memref.store %10, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %11, %subview_4[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %12, %subview_4[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %13, %subview_4[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %14, %subview_4[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %15, %subview_4[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %16, %subview_4[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %17, %subview_4[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %18, %subview_4[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %19, %subview_4[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %20, %subview_4[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %21, %subview_4[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %22, %subview_4[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %23, %subview_4[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %24, %subview_4[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %25, %subview_4[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %26, %subview_4[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %27, %subview_4[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %28, %subview_4[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %29, %subview_4[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %30, %subview_4[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %31, %subview_4[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %32, %subview_4[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %33, %subview_4[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %34, %subview_4[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %35, %subview_4[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %36, %subview_4[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %37, %subview_4[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %38, %subview_4[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %39, %subview_4[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %40, %subview_4[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %41, %subview_4[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> | |
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>> | |
scf.for %arg2 = %c0 to %c43 step %c1 { | |
%48 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%42, %arg2) | |
%49 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %42) | |
%50 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%49, %46, %44) | |
%51 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %42) | |
%52 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%51, %46, %44) | |
%53 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%52, %50) | |
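          // Per-tap window math: %48 appears to be where the gathered data
          // lands inside the pad buffer, [%50, %52) the in-bounds input slice
          // for tap %arg2, and %53 its length.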
%subview_7 = memref.subview %subview_3[0, 0, %50, 0] [1, 1, %53, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %10, %alloca[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32> | |
memref.store %11, %alloca[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32> | |
memref.store %12, %alloca[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32> | |
memref.store %13, %alloca[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32> | |
memref.store %14, %alloca[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32> | |
memref.store %15, %alloca[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32> | |
memref.store %16, %alloca[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32> | |
memref.store %17, %alloca[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32> | |
memref.store %18, %alloca[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32> | |
memref.store %19, %alloca[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32> | |
memref.store %20, %alloca[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32> | |
memref.store %21, %alloca[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32> | |
memref.store %22, %alloca[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32> | |
memref.store %23, %alloca[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32> | |
memref.store %24, %alloca[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32> | |
memref.store %25, %alloca[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32> | |
memref.store %26, %alloca[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32> | |
memref.store %27, %alloca[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32> | |
memref.store %28, %alloca[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32> | |
memref.store %29, %alloca[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32> | |
memref.store %30, %alloca[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32> | |
memref.store %31, %alloca[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32> | |
memref.store %32, %alloca[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32> | |
memref.store %33, %alloca[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32> | |
memref.store %34, %alloca[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32> | |
memref.store %35, %alloca[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32> | |
memref.store %36, %alloca[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32> | |
memref.store %37, %alloca[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32> | |
memref.store %38, %alloca[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32> | |
memref.store %39, %alloca[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32> | |
memref.store %40, %alloca[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32> | |
memref.store %41, %alloca[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32> | |
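          // The pad buffer is re-zeroed before every tap because the guarded
          // stores further down only overwrite the lanes that fall inside the
          // valid input range.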
%54 = arith.cmpi sgt, %53, %c0 : index | |
%55 = scf.if %54 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c0, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %cst_0 [0] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %cst_0 : vector<32x1xi32> | |
} | |
%56 = arith.cmpi sgt, %53, %c1 : index | |
%57 = scf.if %56 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c1, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %55 [1] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %55 : vector<32x1xi32> | |
} | |
%58 = arith.cmpi sgt, %53, %c2 : index | |
%59 = scf.if %58 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c2, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %57 [2] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %57 : vector<32x1xi32> | |
} | |
%60 = arith.cmpi sgt, %53, %c3 : index | |
%61 = scf.if %60 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c3, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %59 [3] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %59 : vector<32x1xi32> | |
} | |
%62 = arith.cmpi sgt, %53, %c4 : index | |
%63 = scf.if %62 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c4, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %61 [4] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %61 : vector<32x1xi32> | |
} | |
%64 = arith.cmpi sgt, %53, %c5 : index | |
%65 = scf.if %64 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c5, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %63 [5] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %63 : vector<32x1xi32> | |
} | |
%66 = arith.cmpi sgt, %53, %c6 : index | |
%67 = scf.if %66 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c6, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %65 [6] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %65 : vector<32x1xi32> | |
} | |
%68 = arith.cmpi sgt, %53, %c7 : index | |
%69 = scf.if %68 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c7, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %67 [7] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %67 : vector<32x1xi32> | |
} | |
%70 = arith.cmpi sgt, %53, %c8 : index | |
%71 = scf.if %70 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c8, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %69 [8] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %69 : vector<32x1xi32> | |
} | |
%72 = arith.cmpi sgt, %53, %c9 : index | |
%73 = scf.if %72 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c9, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %71 [9] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %71 : vector<32x1xi32> | |
} | |
%74 = arith.cmpi sgt, %53, %c10 : index | |
%75 = scf.if %74 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c10, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %73 [10] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %73 : vector<32x1xi32> | |
} | |
%76 = arith.cmpi sgt, %53, %c11 : index | |
%77 = scf.if %76 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c11, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %75 [11] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %75 : vector<32x1xi32> | |
} | |
%78 = arith.cmpi sgt, %53, %c12 : index | |
%79 = scf.if %78 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c12, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %77 [12] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %77 : vector<32x1xi32> | |
} | |
%80 = arith.cmpi sgt, %53, %c13 : index | |
%81 = scf.if %80 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c13, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %79 [13] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %79 : vector<32x1xi32> | |
} | |
%82 = arith.cmpi sgt, %53, %c14 : index | |
%83 = scf.if %82 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c14, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %81 [14] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %81 : vector<32x1xi32> | |
} | |
%84 = arith.cmpi sgt, %53, %c15 : index | |
%85 = scf.if %84 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c15, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %83 [15] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %83 : vector<32x1xi32> | |
} | |
%86 = arith.cmpi sgt, %53, %c16 : index | |
%87 = scf.if %86 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c16, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %85 [16] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %85 : vector<32x1xi32> | |
} | |
%88 = arith.cmpi sgt, %53, %c17 : index | |
%89 = scf.if %88 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c17, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %87 [17] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %87 : vector<32x1xi32> | |
} | |
%90 = arith.cmpi sgt, %53, %c18 : index | |
%91 = scf.if %90 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c18, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %89 [18] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %89 : vector<32x1xi32> | |
} | |
%92 = arith.cmpi sgt, %53, %c19 : index | |
%93 = scf.if %92 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c19, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %91 [19] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %91 : vector<32x1xi32> | |
} | |
%94 = arith.cmpi sgt, %53, %c20 : index | |
%95 = scf.if %94 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c20, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %93 [20] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %93 : vector<32x1xi32> | |
} | |
%96 = arith.cmpi sgt, %53, %c21 : index | |
%97 = scf.if %96 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c21, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %95 [21] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %95 : vector<32x1xi32> | |
} | |
%98 = arith.cmpi sgt, %53, %c22 : index | |
%99 = scf.if %98 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c22, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %97 [22] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %97 : vector<32x1xi32> | |
} | |
%100 = arith.cmpi sgt, %53, %c23 : index | |
%101 = scf.if %100 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c23, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %99 [23] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %99 : vector<32x1xi32> | |
} | |
%102 = arith.cmpi sgt, %53, %c24 : index | |
%103 = scf.if %102 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c24, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %101 [24] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %101 : vector<32x1xi32> | |
} | |
%104 = arith.cmpi sgt, %53, %c25 : index | |
%105 = scf.if %104 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c25, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %103 [25] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %103 : vector<32x1xi32> | |
} | |
%106 = arith.cmpi sgt, %53, %c26 : index | |
%107 = scf.if %106 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c26, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %105 [26] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %105 : vector<32x1xi32> | |
} | |
%108 = arith.cmpi sgt, %53, %c27 : index | |
%109 = scf.if %108 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c27, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %107 [27] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %107 : vector<32x1xi32> | |
} | |
%110 = arith.cmpi sgt, %53, %c28 : index | |
%111 = scf.if %110 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c28, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %109 [28] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %109 : vector<32x1xi32> | |
} | |
%112 = arith.cmpi sgt, %53, %c29 : index | |
%113 = scf.if %112 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c29, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %111 [29] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %111 : vector<32x1xi32> | |
} | |
%114 = arith.cmpi sgt, %53, %c30 : index | |
%115 = scf.if %114 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c30, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %113 [30] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %113 : vector<32x1xi32> | |
} | |
%116 = arith.cmpi sgt, %53, %c31 : index | |
%117 = scf.if %116 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c31, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %115 [31] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %115 : vector<32x1xi32> | |
} | |
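          // %55..%117 emulate a masked (length-guarded) load one lane at a
          // time: lane i is read from %subview_7 only when i < %53, the
          // dynamic number of valid elements, and the previously assembled
          // (zero-initialized) vector is carried through otherwise. %117 is
          // the complete gathered window.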
%118 = arith.cmpi slt, %48, %c32 : index | |
scf.if %118 { | |
%187 = vector.extract %117[0, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %48, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%119 = affine.apply affine_map<(d0) -> (d0 + 1)>(%48) | |
%120 = arith.cmpi slt, %119, %c32 : index | |
scf.if %120 { | |
%187 = vector.extract %117[1, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %119, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%121 = affine.apply affine_map<(d0) -> (d0 + 2)>(%48) | |
%122 = arith.cmpi slt, %121, %c32 : index | |
scf.if %122 { | |
%187 = vector.extract %117[2, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %121, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%123 = affine.apply affine_map<(d0) -> (d0 + 3)>(%48) | |
%124 = arith.cmpi slt, %123, %c32 : index | |
scf.if %124 { | |
%187 = vector.extract %117[3, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %123, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%125 = affine.apply affine_map<(d0) -> (d0 + 4)>(%48) | |
%126 = arith.cmpi slt, %125, %c32 : index | |
scf.if %126 { | |
%187 = vector.extract %117[4, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %125, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%127 = affine.apply affine_map<(d0) -> (d0 + 5)>(%48) | |
%128 = arith.cmpi slt, %127, %c32 : index | |
scf.if %128 { | |
%187 = vector.extract %117[5, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %127, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%129 = affine.apply affine_map<(d0) -> (d0 + 6)>(%48) | |
%130 = arith.cmpi slt, %129, %c32 : index | |
scf.if %130 { | |
%187 = vector.extract %117[6, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %129, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%131 = affine.apply affine_map<(d0) -> (d0 + 7)>(%48) | |
%132 = arith.cmpi slt, %131, %c32 : index | |
scf.if %132 { | |
%187 = vector.extract %117[7, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %131, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%133 = affine.apply affine_map<(d0) -> (d0 + 8)>(%48) | |
%134 = arith.cmpi slt, %133, %c32 : index | |
scf.if %134 { | |
%187 = vector.extract %117[8, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %133, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%135 = affine.apply affine_map<(d0) -> (d0 + 9)>(%48) | |
%136 = arith.cmpi slt, %135, %c32 : index | |
scf.if %136 { | |
%187 = vector.extract %117[9, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %135, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%137 = affine.apply affine_map<(d0) -> (d0 + 10)>(%48) | |
%138 = arith.cmpi slt, %137, %c32 : index | |
scf.if %138 { | |
%187 = vector.extract %117[10, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %137, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%139 = affine.apply affine_map<(d0) -> (d0 + 11)>(%48) | |
%140 = arith.cmpi slt, %139, %c32 : index | |
scf.if %140 { | |
%187 = vector.extract %117[11, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %139, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%141 = affine.apply affine_map<(d0) -> (d0 + 12)>(%48) | |
%142 = arith.cmpi slt, %141, %c32 : index | |
scf.if %142 { | |
%187 = vector.extract %117[12, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %141, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%143 = affine.apply affine_map<(d0) -> (d0 + 13)>(%48) | |
%144 = arith.cmpi slt, %143, %c32 : index | |
scf.if %144 { | |
%187 = vector.extract %117[13, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %143, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%145 = affine.apply affine_map<(d0) -> (d0 + 14)>(%48) | |
%146 = arith.cmpi slt, %145, %c32 : index | |
scf.if %146 { | |
%187 = vector.extract %117[14, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %145, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%147 = affine.apply affine_map<(d0) -> (d0 + 15)>(%48) | |
%148 = arith.cmpi slt, %147, %c32 : index | |
scf.if %148 { | |
%187 = vector.extract %117[15, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %147, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%149 = affine.apply affine_map<(d0) -> (d0 + 16)>(%48) | |
%150 = arith.cmpi slt, %149, %c32 : index | |
scf.if %150 { | |
%187 = vector.extract %117[16, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %149, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%151 = affine.apply affine_map<(d0) -> (d0 + 17)>(%48) | |
%152 = arith.cmpi slt, %151, %c32 : index | |
scf.if %152 { | |
%187 = vector.extract %117[17, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %151, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%153 = affine.apply affine_map<(d0) -> (d0 + 18)>(%48) | |
%154 = arith.cmpi slt, %153, %c32 : index | |
scf.if %154 { | |
%187 = vector.extract %117[18, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %153, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%155 = affine.apply affine_map<(d0) -> (d0 + 19)>(%48) | |
%156 = arith.cmpi slt, %155, %c32 : index | |
scf.if %156 { | |
%187 = vector.extract %117[19, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %155, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%157 = affine.apply affine_map<(d0) -> (d0 + 20)>(%48) | |
%158 = arith.cmpi slt, %157, %c32 : index | |
scf.if %158 { | |
%187 = vector.extract %117[20, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %157, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%159 = affine.apply affine_map<(d0) -> (d0 + 21)>(%48) | |
%160 = arith.cmpi slt, %159, %c32 : index | |
scf.if %160 { | |
%187 = vector.extract %117[21, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %159, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%161 = affine.apply affine_map<(d0) -> (d0 + 22)>(%48) | |
%162 = arith.cmpi slt, %161, %c32 : index | |
scf.if %162 { | |
%187 = vector.extract %117[22, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %161, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%163 = affine.apply affine_map<(d0) -> (d0 + 23)>(%48) | |
%164 = arith.cmpi slt, %163, %c32 : index | |
scf.if %164 { | |
%187 = vector.extract %117[23, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %163, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%165 = affine.apply affine_map<(d0) -> (d0 + 24)>(%48) | |
%166 = arith.cmpi slt, %165, %c32 : index | |
scf.if %166 { | |
%187 = vector.extract %117[24, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %165, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%167 = affine.apply affine_map<(d0) -> (d0 + 25)>(%48) | |
%168 = arith.cmpi slt, %167, %c32 : index | |
scf.if %168 { | |
%187 = vector.extract %117[25, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %167, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%169 = affine.apply affine_map<(d0) -> (d0 + 26)>(%48) | |
%170 = arith.cmpi slt, %169, %c32 : index | |
scf.if %170 { | |
%187 = vector.extract %117[26, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %169, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%171 = affine.apply affine_map<(d0) -> (d0 + 27)>(%48) | |
%172 = arith.cmpi slt, %171, %c32 : index | |
scf.if %172 { | |
%187 = vector.extract %117[27, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %171, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%173 = affine.apply affine_map<(d0) -> (d0 + 28)>(%48) | |
%174 = arith.cmpi slt, %173, %c32 : index | |
scf.if %174 { | |
%187 = vector.extract %117[28, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %173, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%175 = affine.apply affine_map<(d0) -> (d0 + 29)>(%48) | |
%176 = arith.cmpi slt, %175, %c32 : index | |
scf.if %176 { | |
%187 = vector.extract %117[29, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %175, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%177 = affine.apply affine_map<(d0) -> (d0 + 30)>(%48) | |
%178 = arith.cmpi slt, %177, %c32 : index | |
scf.if %178 { | |
%187 = vector.extract %117[30, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %177, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%179 = affine.apply affine_map<(d0) -> (d0 + 31)>(%48) | |
%180 = arith.cmpi slt, %179, %c32 : index | |
scf.if %180 { | |
%187 = vector.extract %117[31, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %179, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
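          // The guarded stores above write the gathered window into %alloca
          // shifted right by %48 lanes, realizing the zero padding at the left
          // edge; lanes that would land at index >= 32 are simply dropped.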
%181 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32> | |
%extracted = tensor.extract %cst[%c0, %arg2, %c0] : tensor<1x43x1xi32> | |
%182 = vector.broadcast %extracted : i32 to vector<1xi32> | |
%183 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
%184 = vector.shuffle %182, %182 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%185 = arith.muli %181, %184 : vector<32xi32> | |
%186 = arith.addi %185, %183 : vector<32xi32> | |
vector.store %186, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
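          // The actual depthwise-convolution update for tap %arg2: the filter
          // weight is read from the constant tensor, broadcast to all 32 lanes
          // by the vector.shuffle, multiplied with the padded window and
          // accumulated into the output row -- in scalar form, roughly
          //   for x in [0, 32): out[x] += pad_window[x] * filter[%arg2].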
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After LinalgStrategyEnablePass (iree-linalg-strategy-enable-pass) //----- // | |
%187 = memref.load %subview_7[%c0, %c0, %c25, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %103 [25] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %103 : vector<32x1xi32> | |
} | |
%106 = arith.cmpi sgt, %53, %c26 : index | |
%107 = scf.if %106 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c26, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %105 [26] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %105 : vector<32x1xi32> | |
} | |
%108 = arith.cmpi sgt, %53, %c27 : index | |
%109 = scf.if %108 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c27, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %107 [27] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %107 : vector<32x1xi32> | |
} | |
%110 = arith.cmpi sgt, %53, %c28 : index | |
%111 = scf.if %110 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c28, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %109 [28] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %109 : vector<32x1xi32> | |
} | |
%112 = arith.cmpi sgt, %53, %c29 : index | |
%113 = scf.if %112 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c29, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %111 [29] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %111 : vector<32x1xi32> | |
} | |
%114 = arith.cmpi sgt, %53, %c30 : index | |
%115 = scf.if %114 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c30, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %113 [30] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %113 : vector<32x1xi32> | |
} | |
%116 = arith.cmpi sgt, %53, %c31 : index | |
%117 = scf.if %116 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c31, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %115 [31] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %115 : vector<32x1xi32> | |
} | |
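        // Editorial note: the guarded loads above emulate a masked gather. Lane i
        // of %117 holds element i of the padded input window only while i < %53
        // (the number of valid elements); the remaining lanes keep their zero
        // seed. The guarded stores below scatter those lanes back into the
        // %alloca staging tile, shifted by %48 (the left padding still to be
        // consumed), dropping any lane whose destination index would reach 32.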
%118 = arith.cmpi slt, %48, %c32 : index | |
scf.if %118 { | |
%187 = vector.extract %117[0, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %48, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%119 = affine.apply affine_map<(d0) -> (d0 + 1)>(%48) | |
%120 = arith.cmpi slt, %119, %c32 : index | |
scf.if %120 { | |
%187 = vector.extract %117[1, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %119, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%121 = affine.apply affine_map<(d0) -> (d0 + 2)>(%48) | |
%122 = arith.cmpi slt, %121, %c32 : index | |
scf.if %122 { | |
%187 = vector.extract %117[2, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %121, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%123 = affine.apply affine_map<(d0) -> (d0 + 3)>(%48) | |
%124 = arith.cmpi slt, %123, %c32 : index | |
scf.if %124 { | |
%187 = vector.extract %117[3, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %123, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%125 = affine.apply affine_map<(d0) -> (d0 + 4)>(%48) | |
%126 = arith.cmpi slt, %125, %c32 : index | |
scf.if %126 { | |
%187 = vector.extract %117[4, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %125, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%127 = affine.apply affine_map<(d0) -> (d0 + 5)>(%48) | |
%128 = arith.cmpi slt, %127, %c32 : index | |
scf.if %128 { | |
%187 = vector.extract %117[5, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %127, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%129 = affine.apply affine_map<(d0) -> (d0 + 6)>(%48) | |
%130 = arith.cmpi slt, %129, %c32 : index | |
scf.if %130 { | |
%187 = vector.extract %117[6, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %129, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%131 = affine.apply affine_map<(d0) -> (d0 + 7)>(%48) | |
%132 = arith.cmpi slt, %131, %c32 : index | |
scf.if %132 { | |
%187 = vector.extract %117[7, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %131, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%133 = affine.apply affine_map<(d0) -> (d0 + 8)>(%48) | |
%134 = arith.cmpi slt, %133, %c32 : index | |
scf.if %134 { | |
%187 = vector.extract %117[8, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %133, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%135 = affine.apply affine_map<(d0) -> (d0 + 9)>(%48) | |
%136 = arith.cmpi slt, %135, %c32 : index | |
scf.if %136 { | |
%187 = vector.extract %117[9, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %135, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%137 = affine.apply affine_map<(d0) -> (d0 + 10)>(%48) | |
%138 = arith.cmpi slt, %137, %c32 : index | |
scf.if %138 { | |
%187 = vector.extract %117[10, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %137, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%139 = affine.apply affine_map<(d0) -> (d0 + 11)>(%48) | |
%140 = arith.cmpi slt, %139, %c32 : index | |
scf.if %140 { | |
%187 = vector.extract %117[11, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %139, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%141 = affine.apply affine_map<(d0) -> (d0 + 12)>(%48) | |
%142 = arith.cmpi slt, %141, %c32 : index | |
scf.if %142 { | |
%187 = vector.extract %117[12, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %141, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%143 = affine.apply affine_map<(d0) -> (d0 + 13)>(%48) | |
%144 = arith.cmpi slt, %143, %c32 : index | |
scf.if %144 { | |
%187 = vector.extract %117[13, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %143, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%145 = affine.apply affine_map<(d0) -> (d0 + 14)>(%48) | |
%146 = arith.cmpi slt, %145, %c32 : index | |
scf.if %146 { | |
%187 = vector.extract %117[14, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %145, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%147 = affine.apply affine_map<(d0) -> (d0 + 15)>(%48) | |
%148 = arith.cmpi slt, %147, %c32 : index | |
scf.if %148 { | |
%187 = vector.extract %117[15, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %147, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%149 = affine.apply affine_map<(d0) -> (d0 + 16)>(%48) | |
%150 = arith.cmpi slt, %149, %c32 : index | |
scf.if %150 { | |
%187 = vector.extract %117[16, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %149, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%151 = affine.apply affine_map<(d0) -> (d0 + 17)>(%48) | |
%152 = arith.cmpi slt, %151, %c32 : index | |
scf.if %152 { | |
%187 = vector.extract %117[17, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %151, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%153 = affine.apply affine_map<(d0) -> (d0 + 18)>(%48) | |
%154 = arith.cmpi slt, %153, %c32 : index | |
scf.if %154 { | |
%187 = vector.extract %117[18, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %153, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%155 = affine.apply affine_map<(d0) -> (d0 + 19)>(%48) | |
%156 = arith.cmpi slt, %155, %c32 : index | |
scf.if %156 { | |
%187 = vector.extract %117[19, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %155, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%157 = affine.apply affine_map<(d0) -> (d0 + 20)>(%48) | |
%158 = arith.cmpi slt, %157, %c32 : index | |
scf.if %158 { | |
%187 = vector.extract %117[20, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %157, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%159 = affine.apply affine_map<(d0) -> (d0 + 21)>(%48) | |
%160 = arith.cmpi slt, %159, %c32 : index | |
scf.if %160 { | |
%187 = vector.extract %117[21, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %159, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%161 = affine.apply affine_map<(d0) -> (d0 + 22)>(%48) | |
%162 = arith.cmpi slt, %161, %c32 : index | |
scf.if %162 { | |
%187 = vector.extract %117[22, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %161, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%163 = affine.apply affine_map<(d0) -> (d0 + 23)>(%48) | |
%164 = arith.cmpi slt, %163, %c32 : index | |
scf.if %164 { | |
%187 = vector.extract %117[23, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %163, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%165 = affine.apply affine_map<(d0) -> (d0 + 24)>(%48) | |
%166 = arith.cmpi slt, %165, %c32 : index | |
scf.if %166 { | |
%187 = vector.extract %117[24, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %165, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%167 = affine.apply affine_map<(d0) -> (d0 + 25)>(%48) | |
%168 = arith.cmpi slt, %167, %c32 : index | |
scf.if %168 { | |
%187 = vector.extract %117[25, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %167, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%169 = affine.apply affine_map<(d0) -> (d0 + 26)>(%48) | |
%170 = arith.cmpi slt, %169, %c32 : index | |
scf.if %170 { | |
%187 = vector.extract %117[26, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %169, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%171 = affine.apply affine_map<(d0) -> (d0 + 27)>(%48) | |
%172 = arith.cmpi slt, %171, %c32 : index | |
scf.if %172 { | |
%187 = vector.extract %117[27, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %171, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%173 = affine.apply affine_map<(d0) -> (d0 + 28)>(%48) | |
%174 = arith.cmpi slt, %173, %c32 : index | |
scf.if %174 { | |
%187 = vector.extract %117[28, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %173, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%175 = affine.apply affine_map<(d0) -> (d0 + 29)>(%48) | |
%176 = arith.cmpi slt, %175, %c32 : index | |
scf.if %176 { | |
%187 = vector.extract %117[29, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %175, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%177 = affine.apply affine_map<(d0) -> (d0 + 30)>(%48) | |
%178 = arith.cmpi slt, %177, %c32 : index | |
scf.if %178 { | |
%187 = vector.extract %117[30, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %177, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%179 = affine.apply affine_map<(d0) -> (d0 + 31)>(%48) | |
%180 = arith.cmpi slt, %179, %c32 : index | |
scf.if %180 { | |
%187 = vector.extract %117[31, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %179, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
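        // Editorial note: one filter tap's multiply-accumulate. Load the 32
        // staged (zero-padded) inputs, splat the tap %cst[0, %arg2, 0] across
        // 32 lanes, multiply, and add into the 32-wide output row loaded from
        // (and stored back to) %collapse_shape_6.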
%181 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32> | |
%extracted = tensor.extract %cst[%c0, %arg2, %c0] : tensor<1x43x1xi32> | |
%182 = vector.broadcast %extracted : i32 to vector<1xi32> | |
%183 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
%184 = vector.shuffle %182, %182 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%185 = arith.muli %181, %184 : vector<32xi32> | |
%186 = arith.addi %185, %183 : vector<32xi32> | |
vector.store %186, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
} | |
} | |
} | |
return | |
} | |
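// For orientation, a minimal scalar sketch of what one workgroup tile computes.
// This is an editorial reconstruction, not compiler output; it assumes an input
// that has already been zero-padded by 21 columns on each side, whereas the IR
// above realizes that padding with clamped subviews and guarded loads:
//
//   // depthwise conv, 1x43 filter along W, per 60x64 output tile:
//   for (int h = 0; h < 60; ++h)
//     for (int w = 0; w < 64; ++w) {
//       int32_t acc = 0;
//       for (int k = 0; k < 43; ++k)
//         acc += in_padded[h][w + k] * filter[k];
//       out[h][w] = acc;
//     }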
// -----// IR Dump After LinalgVectorLowering (linalg-vector-lowering) //----- // | |
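// Editorial note: the function below appears unchanged from the preceding dump;
// the convolution was already decomposed and vectorized, so this pass has
// nothing further to lower here. The structure is: zero-fill each 1x32 output
// strip, then for each of the 43 filter taps gather the padded input window and
// multiply-accumulate.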
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c2 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%c4 = arith.constant 4 : index | |
%c5 = arith.constant 5 : index | |
%c6 = arith.constant 6 : index | |
%c7 = arith.constant 7 : index | |
%c8 = arith.constant 8 : index | |
%c9 = arith.constant 9 : index | |
%c10 = arith.constant 10 : index | |
%c11 = arith.constant 11 : index | |
%c12 = arith.constant 12 : index | |
%c13 = arith.constant 13 : index | |
%c14 = arith.constant 14 : index | |
%c15 = arith.constant 15 : index | |
%c16 = arith.constant 16 : index | |
%c17 = arith.constant 17 : index | |
%c18 = arith.constant 18 : index | |
%c19 = arith.constant 19 : index | |
%c20 = arith.constant 20 : index | |
%c21 = arith.constant 21 : index | |
%c22 = arith.constant 22 : index | |
%c23 = arith.constant 23 : index | |
%c24 = arith.constant 24 : index | |
%c25 = arith.constant 25 : index | |
%c26 = arith.constant 26 : index | |
%c27 = arith.constant 27 : index | |
%c28 = arith.constant 28 : index | |
%c29 = arith.constant 29 : index | |
%c30 = arith.constant 30 : index | |
%c31 = arith.constant 31 : index | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c64 = arith.constant 64 : index | |
%c60 = arith.constant 60 : index | |
%c32 = arith.constant 32 : index | |
%c43 = arith.constant 43 : index | |
%cst_0 = arith.constant dense<0> : vector<32x1xi32> | |
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32> | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %0, 64 : memref<1x1080x1920x1xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%subview = memref.subview %1[0, %2, %3, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%4 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x] | |
%5 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x] | |
%6 = affine.min affine_map<()[s0] -> (1920, s0)>()[%5] | |
%7 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x] | |
%8 = affine.min affine_map<()[s0] -> (1920, s0)>()[%7] | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%8, %6] | |
%subview_1 = memref.subview %0[0, %2, %6, 0] [1, 60, %9, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>> | |
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>> | |
%10 = vector.extract %cst_0[0, 0] : vector<32x1xi32> | |
%11 = vector.extract %cst_0[1, 0] : vector<32x1xi32> | |
%12 = vector.extract %cst_0[2, 0] : vector<32x1xi32> | |
%13 = vector.extract %cst_0[3, 0] : vector<32x1xi32> | |
%14 = vector.extract %cst_0[4, 0] : vector<32x1xi32> | |
%15 = vector.extract %cst_0[5, 0] : vector<32x1xi32> | |
%16 = vector.extract %cst_0[6, 0] : vector<32x1xi32> | |
%17 = vector.extract %cst_0[7, 0] : vector<32x1xi32> | |
%18 = vector.extract %cst_0[8, 0] : vector<32x1xi32> | |
%19 = vector.extract %cst_0[9, 0] : vector<32x1xi32> | |
%20 = vector.extract %cst_0[10, 0] : vector<32x1xi32> | |
%21 = vector.extract %cst_0[11, 0] : vector<32x1xi32> | |
%22 = vector.extract %cst_0[12, 0] : vector<32x1xi32> | |
%23 = vector.extract %cst_0[13, 0] : vector<32x1xi32> | |
%24 = vector.extract %cst_0[14, 0] : vector<32x1xi32> | |
%25 = vector.extract %cst_0[15, 0] : vector<32x1xi32> | |
%26 = vector.extract %cst_0[16, 0] : vector<32x1xi32> | |
%27 = vector.extract %cst_0[17, 0] : vector<32x1xi32> | |
%28 = vector.extract %cst_0[18, 0] : vector<32x1xi32> | |
%29 = vector.extract %cst_0[19, 0] : vector<32x1xi32> | |
%30 = vector.extract %cst_0[20, 0] : vector<32x1xi32> | |
%31 = vector.extract %cst_0[21, 0] : vector<32x1xi32> | |
%32 = vector.extract %cst_0[22, 0] : vector<32x1xi32> | |
%33 = vector.extract %cst_0[23, 0] : vector<32x1xi32> | |
%34 = vector.extract %cst_0[24, 0] : vector<32x1xi32> | |
%35 = vector.extract %cst_0[25, 0] : vector<32x1xi32> | |
%36 = vector.extract %cst_0[26, 0] : vector<32x1xi32> | |
%37 = vector.extract %cst_0[27, 0] : vector<32x1xi32> | |
%38 = vector.extract %cst_0[28, 0] : vector<32x1xi32> | |
%39 = vector.extract %cst_0[29, 0] : vector<32x1xi32> | |
%40 = vector.extract %cst_0[30, 0] : vector<32x1xi32> | |
%41 = vector.extract %cst_0[31, 0] : vector<32x1xi32> | |
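  // Editorial note: %10..%41 are the 32 lanes of the zero splat %cst_0,
  // extracted once up front and reused below both to zero-fill each 1x32 output
  // strip and to reset the %alloca staging buffer before every gather.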
scf.for %arg0 = %c0 to %c60 step %c1 { | |
scf.for %arg1 = %c0 to %c64 step %c32 { | |
%42 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%4] | |
%43 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%4] | |
%44 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%43)[%8, %6] | |
%45 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%4] | |
%46 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%45)[%8, %6] | |
%47 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%46, %44) | |
%subview_3 = memref.subview %subview_1[0, %arg0, %44, 0] [1, 1, %47, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %10, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %11, %subview_4[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %12, %subview_4[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %13, %subview_4[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %14, %subview_4[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %15, %subview_4[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %16, %subview_4[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %17, %subview_4[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %18, %subview_4[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %19, %subview_4[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %20, %subview_4[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %21, %subview_4[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %22, %subview_4[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %23, %subview_4[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %24, %subview_4[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %25, %subview_4[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %26, %subview_4[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %27, %subview_4[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %28, %subview_4[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %29, %subview_4[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %30, %subview_4[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %31, %subview_4[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %32, %subview_4[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %33, %subview_4[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %34, %subview_4[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %35, %subview_4[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %36, %subview_4[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %37, %subview_4[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %38, %subview_4[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %39, %subview_4[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %40, %subview_4[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %41, %subview_4[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> | |
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>> | |
scf.for %arg2 = %c0 to %c43 step %c1 { | |
%48 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%42, %arg2) | |
%49 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %42) | |
%50 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%49, %46, %44) | |
%51 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %42) | |
%52 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%51, %46, %44) | |
%53 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%52, %50) | |
%subview_7 = memref.subview %subview_3[0, 0, %50, 0] [1, 1, %53, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
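        // Editorial note: reset the staging tile to zero so that lanes of the
        // window that fall in the padding region contribute nothing to the
        // accumulation.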
memref.store %10, %alloca[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32> | |
memref.store %11, %alloca[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32> | |
memref.store %12, %alloca[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32> | |
memref.store %13, %alloca[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32> | |
memref.store %14, %alloca[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32> | |
memref.store %15, %alloca[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32> | |
memref.store %16, %alloca[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32> | |
memref.store %17, %alloca[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32> | |
memref.store %18, %alloca[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32> | |
memref.store %19, %alloca[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32> | |
memref.store %20, %alloca[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32> | |
memref.store %21, %alloca[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32> | |
memref.store %22, %alloca[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32> | |
memref.store %23, %alloca[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32> | |
memref.store %24, %alloca[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32> | |
memref.store %25, %alloca[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32> | |
memref.store %26, %alloca[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32> | |
memref.store %27, %alloca[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32> | |
memref.store %28, %alloca[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32> | |
memref.store %29, %alloca[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32> | |
memref.store %30, %alloca[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32> | |
memref.store %31, %alloca[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32> | |
memref.store %32, %alloca[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32> | |
memref.store %33, %alloca[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32> | |
memref.store %34, %alloca[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32> | |
memref.store %35, %alloca[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32> | |
memref.store %36, %alloca[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32> | |
memref.store %37, %alloca[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32> | |
memref.store %38, %alloca[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32> | |
memref.store %39, %alloca[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32> | |
memref.store %40, %alloca[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32> | |
memref.store %41, %alloca[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32> | |
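        // Editorial note: masked gather, unrolled lane by lane. Each scf.if
        // loads element i of the dynamically sized slice %subview_7 only while
        // i < %53, threading the partially filled vector through as the
        // else-value.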
%54 = arith.cmpi sgt, %53, %c0 : index | |
%55 = scf.if %54 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c0, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %cst_0 [0] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %cst_0 : vector<32x1xi32> | |
} | |
%56 = arith.cmpi sgt, %53, %c1 : index | |
%57 = scf.if %56 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c1, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %55 [1] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %55 : vector<32x1xi32> | |
} | |
%58 = arith.cmpi sgt, %53, %c2 : index | |
%59 = scf.if %58 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c2, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %57 [2] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %57 : vector<32x1xi32> | |
} | |
%60 = arith.cmpi sgt, %53, %c3 : index | |
%61 = scf.if %60 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c3, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %59 [3] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %59 : vector<32x1xi32> | |
} | |
%62 = arith.cmpi sgt, %53, %c4 : index | |
%63 = scf.if %62 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c4, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %61 [4] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %61 : vector<32x1xi32> | |
} | |
%64 = arith.cmpi sgt, %53, %c5 : index | |
%65 = scf.if %64 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c5, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %63 [5] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %63 : vector<32x1xi32> | |
} | |
%66 = arith.cmpi sgt, %53, %c6 : index | |
%67 = scf.if %66 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c6, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %65 [6] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %65 : vector<32x1xi32> | |
} | |
%68 = arith.cmpi sgt, %53, %c7 : index | |
%69 = scf.if %68 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c7, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %67 [7] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %67 : vector<32x1xi32> | |
} | |
%70 = arith.cmpi sgt, %53, %c8 : index | |
%71 = scf.if %70 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c8, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %69 [8] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %69 : vector<32x1xi32> | |
} | |
%72 = arith.cmpi sgt, %53, %c9 : index | |
%73 = scf.if %72 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c9, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %71 [9] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %71 : vector<32x1xi32> | |
} | |
%74 = arith.cmpi sgt, %53, %c10 : index | |
%75 = scf.if %74 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c10, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %73 [10] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %73 : vector<32x1xi32> | |
} | |
%76 = arith.cmpi sgt, %53, %c11 : index | |
%77 = scf.if %76 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c11, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %75 [11] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %75 : vector<32x1xi32> | |
} | |
%78 = arith.cmpi sgt, %53, %c12 : index | |
%79 = scf.if %78 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c12, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %77 [12] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %77 : vector<32x1xi32> | |
} | |
%80 = arith.cmpi sgt, %53, %c13 : index | |
%81 = scf.if %80 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c13, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %79 [13] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %79 : vector<32x1xi32> | |
} | |
%82 = arith.cmpi sgt, %53, %c14 : index | |
%83 = scf.if %82 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c14, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %81 [14] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %81 : vector<32x1xi32> | |
} | |
%84 = arith.cmpi sgt, %53, %c15 : index | |
%85 = scf.if %84 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c15, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %83 [15] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %83 : vector<32x1xi32> | |
} | |
%86 = arith.cmpi sgt, %53, %c16 : index | |
%87 = scf.if %86 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c16, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %85 [16] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %85 : vector<32x1xi32> | |
} | |
%88 = arith.cmpi sgt, %53, %c17 : index | |
%89 = scf.if %88 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c17, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %87 [17] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %87 : vector<32x1xi32> | |
} | |
%90 = arith.cmpi sgt, %53, %c18 : index | |
%91 = scf.if %90 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c18, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %89 [18] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %89 : vector<32x1xi32> | |
} | |
%92 = arith.cmpi sgt, %53, %c19 : index | |
%93 = scf.if %92 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c19, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %91 [19] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %91 : vector<32x1xi32> | |
} | |
%94 = arith.cmpi sgt, %53, %c20 : index | |
%95 = scf.if %94 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c20, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %93 [20] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %93 : vector<32x1xi32> | |
} | |
%96 = arith.cmpi sgt, %53, %c21 : index | |
%97 = scf.if %96 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c21, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %95 [21] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %95 : vector<32x1xi32> | |
} | |
%98 = arith.cmpi sgt, %53, %c22 : index | |
%99 = scf.if %98 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c22, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %97 [22] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %97 : vector<32x1xi32> | |
} | |
%100 = arith.cmpi sgt, %53, %c23 : index | |
%101 = scf.if %100 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c23, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %99 [23] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %99 : vector<32x1xi32> | |
} | |
%102 = arith.cmpi sgt, %53, %c24 : index | |
%103 = scf.if %102 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c24, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %101 [24] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %101 : vector<32x1xi32> | |
} | |
%104 = arith.cmpi sgt, %53, %c25 : index | |
%105 = scf.if %104 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c25, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %103 [25] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %103 : vector<32x1xi32> | |
} | |
%106 = arith.cmpi sgt, %53, %c26 : index | |
%107 = scf.if %106 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c26, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %105 [26] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %105 : vector<32x1xi32> | |
} | |
%108 = arith.cmpi sgt, %53, %c27 : index | |
%109 = scf.if %108 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c27, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %107 [27] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %107 : vector<32x1xi32> | |
} | |
%110 = arith.cmpi sgt, %53, %c28 : index | |
%111 = scf.if %110 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c28, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %109 [28] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %109 : vector<32x1xi32> | |
} | |
%112 = arith.cmpi sgt, %53, %c29 : index | |
%113 = scf.if %112 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c29, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %111 [29] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %111 : vector<32x1xi32> | |
} | |
%114 = arith.cmpi sgt, %53, %c30 : index | |
%115 = scf.if %114 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c30, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %113 [30] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %113 : vector<32x1xi32> | |
} | |
%116 = arith.cmpi sgt, %53, %c31 : index | |
%117 = scf.if %116 -> (vector<32x1xi32>) { | |
%187 = memref.load %subview_7[%c0, %c0, %c31, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%188 = vector.broadcast %187 : i32 to vector<1xi32> | |
%189 = vector.insert %188, %115 [31] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %189 : vector<32x1xi32> | |
} else { | |
scf.yield %115 : vector<32x1xi32> | |
} | |
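        // Editorial note: scatter the gathered lanes back into %alloca at
        // offset %48, the left-padding amount remaining for this filter tap;
        // lanes whose destination index would reach 32 are skipped.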
%118 = arith.cmpi slt, %48, %c32 : index | |
scf.if %118 { | |
%187 = vector.extract %117[0, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %48, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%119 = affine.apply affine_map<(d0) -> (d0 + 1)>(%48) | |
%120 = arith.cmpi slt, %119, %c32 : index | |
scf.if %120 { | |
%187 = vector.extract %117[1, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %119, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%121 = affine.apply affine_map<(d0) -> (d0 + 2)>(%48) | |
%122 = arith.cmpi slt, %121, %c32 : index | |
scf.if %122 { | |
%187 = vector.extract %117[2, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %121, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%123 = affine.apply affine_map<(d0) -> (d0 + 3)>(%48) | |
%124 = arith.cmpi slt, %123, %c32 : index | |
scf.if %124 { | |
%187 = vector.extract %117[3, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %123, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%125 = affine.apply affine_map<(d0) -> (d0 + 4)>(%48) | |
%126 = arith.cmpi slt, %125, %c32 : index | |
scf.if %126 { | |
%187 = vector.extract %117[4, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %125, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%127 = affine.apply affine_map<(d0) -> (d0 + 5)>(%48) | |
%128 = arith.cmpi slt, %127, %c32 : index | |
scf.if %128 { | |
%187 = vector.extract %117[5, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %127, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%129 = affine.apply affine_map<(d0) -> (d0 + 6)>(%48) | |
%130 = arith.cmpi slt, %129, %c32 : index | |
scf.if %130 { | |
%187 = vector.extract %117[6, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %129, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%131 = affine.apply affine_map<(d0) -> (d0 + 7)>(%48) | |
%132 = arith.cmpi slt, %131, %c32 : index | |
scf.if %132 { | |
%187 = vector.extract %117[7, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %131, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%133 = affine.apply affine_map<(d0) -> (d0 + 8)>(%48) | |
%134 = arith.cmpi slt, %133, %c32 : index | |
scf.if %134 { | |
%187 = vector.extract %117[8, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %133, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%135 = affine.apply affine_map<(d0) -> (d0 + 9)>(%48) | |
%136 = arith.cmpi slt, %135, %c32 : index | |
scf.if %136 { | |
%187 = vector.extract %117[9, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %135, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%137 = affine.apply affine_map<(d0) -> (d0 + 10)>(%48) | |
%138 = arith.cmpi slt, %137, %c32 : index | |
scf.if %138 { | |
%187 = vector.extract %117[10, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %137, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%139 = affine.apply affine_map<(d0) -> (d0 + 11)>(%48) | |
%140 = arith.cmpi slt, %139, %c32 : index | |
scf.if %140 { | |
%187 = vector.extract %117[11, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %139, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%141 = affine.apply affine_map<(d0) -> (d0 + 12)>(%48) | |
%142 = arith.cmpi slt, %141, %c32 : index | |
scf.if %142 { | |
%187 = vector.extract %117[12, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %141, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%143 = affine.apply affine_map<(d0) -> (d0 + 13)>(%48) | |
%144 = arith.cmpi slt, %143, %c32 : index | |
scf.if %144 { | |
%187 = vector.extract %117[13, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %143, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%145 = affine.apply affine_map<(d0) -> (d0 + 14)>(%48) | |
%146 = arith.cmpi slt, %145, %c32 : index | |
scf.if %146 { | |
%187 = vector.extract %117[14, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %145, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%147 = affine.apply affine_map<(d0) -> (d0 + 15)>(%48) | |
%148 = arith.cmpi slt, %147, %c32 : index | |
scf.if %148 { | |
%187 = vector.extract %117[15, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %147, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%149 = affine.apply affine_map<(d0) -> (d0 + 16)>(%48) | |
%150 = arith.cmpi slt, %149, %c32 : index | |
scf.if %150 { | |
%187 = vector.extract %117[16, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %149, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%151 = affine.apply affine_map<(d0) -> (d0 + 17)>(%48) | |
%152 = arith.cmpi slt, %151, %c32 : index | |
scf.if %152 { | |
%187 = vector.extract %117[17, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %151, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%153 = affine.apply affine_map<(d0) -> (d0 + 18)>(%48) | |
%154 = arith.cmpi slt, %153, %c32 : index | |
scf.if %154 { | |
%187 = vector.extract %117[18, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %153, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%155 = affine.apply affine_map<(d0) -> (d0 + 19)>(%48) | |
%156 = arith.cmpi slt, %155, %c32 : index | |
scf.if %156 { | |
%187 = vector.extract %117[19, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %155, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%157 = affine.apply affine_map<(d0) -> (d0 + 20)>(%48) | |
%158 = arith.cmpi slt, %157, %c32 : index | |
scf.if %158 { | |
%187 = vector.extract %117[20, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %157, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%159 = affine.apply affine_map<(d0) -> (d0 + 21)>(%48) | |
%160 = arith.cmpi slt, %159, %c32 : index | |
scf.if %160 { | |
%187 = vector.extract %117[21, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %159, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%161 = affine.apply affine_map<(d0) -> (d0 + 22)>(%48) | |
%162 = arith.cmpi slt, %161, %c32 : index | |
scf.if %162 { | |
%187 = vector.extract %117[22, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %161, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%163 = affine.apply affine_map<(d0) -> (d0 + 23)>(%48) | |
%164 = arith.cmpi slt, %163, %c32 : index | |
scf.if %164 { | |
%187 = vector.extract %117[23, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %163, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%165 = affine.apply affine_map<(d0) -> (d0 + 24)>(%48) | |
%166 = arith.cmpi slt, %165, %c32 : index | |
scf.if %166 { | |
%187 = vector.extract %117[24, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %165, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%167 = affine.apply affine_map<(d0) -> (d0 + 25)>(%48) | |
%168 = arith.cmpi slt, %167, %c32 : index | |
scf.if %168 { | |
%187 = vector.extract %117[25, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %167, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%169 = affine.apply affine_map<(d0) -> (d0 + 26)>(%48) | |
%170 = arith.cmpi slt, %169, %c32 : index | |
scf.if %170 { | |
%187 = vector.extract %117[26, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %169, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%171 = affine.apply affine_map<(d0) -> (d0 + 27)>(%48) | |
%172 = arith.cmpi slt, %171, %c32 : index | |
scf.if %172 { | |
%187 = vector.extract %117[27, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %171, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%173 = affine.apply affine_map<(d0) -> (d0 + 28)>(%48) | |
%174 = arith.cmpi slt, %173, %c32 : index | |
scf.if %174 { | |
%187 = vector.extract %117[28, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %173, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%175 = affine.apply affine_map<(d0) -> (d0 + 29)>(%48) | |
%176 = arith.cmpi slt, %175, %c32 : index | |
scf.if %176 { | |
%187 = vector.extract %117[29, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %175, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%177 = affine.apply affine_map<(d0) -> (d0 + 30)>(%48) | |
%178 = arith.cmpi slt, %177, %c32 : index | |
scf.if %178 { | |
%187 = vector.extract %117[30, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %177, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
%179 = affine.apply affine_map<(d0) -> (d0 + 31)>(%48) | |
%180 = arith.cmpi slt, %179, %c32 : index | |
scf.if %180 { | |
%187 = vector.extract %117[31, 0] : vector<32x1xi32> | |
memref.store %187, %alloca[%c0, %c0, %179, %c0] : memref<1x1x32x1xi32> | |
} else { | |
} | |
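        // Editorial note: multiply-accumulate for this tap, as annotated in the
        // previous dump.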
%181 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32> | |
%extracted = tensor.extract %cst[%c0, %arg2, %c0] : tensor<1x43x1xi32> | |
%182 = vector.broadcast %extracted : i32 to vector<1xi32> | |
%183 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
%184 = vector.shuffle %182, %182 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%185 = arith.muli %181, %184 : vector<32xi32> | |
%186 = arith.addi %185, %183 : vector<32xi32> | |
vector.store %186, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
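// Editorial note: canonicalization folds the 32 per-lane vector.extracts of the
// zero splat into the single scalar constant %c0_i32, which is now stored
// directly in the zero-filling stores visible below; the splat %cst_0 survives
// as the seed of the gather's insert chain.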
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c0_i32 = arith.constant 0 : i32 | |
%c2 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%c4 = arith.constant 4 : index | |
%c5 = arith.constant 5 : index | |
%c6 = arith.constant 6 : index | |
%c7 = arith.constant 7 : index | |
%c8 = arith.constant 8 : index | |
%c9 = arith.constant 9 : index | |
%c10 = arith.constant 10 : index | |
%c11 = arith.constant 11 : index | |
%c12 = arith.constant 12 : index | |
%c13 = arith.constant 13 : index | |
%c14 = arith.constant 14 : index | |
%c15 = arith.constant 15 : index | |
%c16 = arith.constant 16 : index | |
%c17 = arith.constant 17 : index | |
%c18 = arith.constant 18 : index | |
%c19 = arith.constant 19 : index | |
%c20 = arith.constant 20 : index | |
%c21 = arith.constant 21 : index | |
%c22 = arith.constant 22 : index | |
%c23 = arith.constant 23 : index | |
%c24 = arith.constant 24 : index | |
%c25 = arith.constant 25 : index | |
%c26 = arith.constant 26 : index | |
%c27 = arith.constant 27 : index | |
%c28 = arith.constant 28 : index | |
%c29 = arith.constant 29 : index | |
%c30 = arith.constant 30 : index | |
%c31 = arith.constant 31 : index | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c64 = arith.constant 64 : index | |
%c60 = arith.constant 60 : index | |
%c32 = arith.constant 32 : index | |
%c43 = arith.constant 43 : index | |
%cst_0 = arith.constant dense<0> : vector<32x1xi32> | |
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32> | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %0, 64 : memref<1x1080x1920x1xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%subview = memref.subview %1[0, %2, %3, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%4 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x] | |
%5 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x] | |
%6 = affine.min affine_map<()[s0] -> (1920, s0)>()[%5] | |
%7 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x] | |
%8 = affine.min affine_map<()[s0] -> (1920, s0)>()[%7] | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%8, %6] | |
%subview_1 = memref.subview %0[0, %2, %6, 0] [1, 60, %9, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>> | |
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>> | |
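// %alloca is a 1x1x32x1 scratch tile holding a zero-padded input window; %collapse_shape is its flat 1x32 view used for vector loads. The loops below walk the 60 rows and the two 32-lane column tiles of this workgroup's output. | |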
scf.for %arg0 = %c0 to %c60 step %c1 { | |
scf.for %arg1 = %c0 to %c64 step %c32 { | |
%10 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%4] | |
%11 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%4] | |
%12 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%11)[%8, %6] | |
%13 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%4] | |
%14 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%13)[%8, %6] | |
%15 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%14, %12) | |
%subview_3 = memref.subview %subview_1[0, %arg0, %12, 0] [1, 1, %15, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
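// Zero-initialize the 32-lane output accumulator tile, one scalar store per lane. | |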
memref.store %c0_i32, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> | |
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>> | |
scf.for %arg2 = %c0 to %c43 step %c1 { | |
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%10, %arg2) | |
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %10) | |
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %14, %12) | |
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %10) | |
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %14, %12) | |
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18) | |
%subview_7 = memref.subview %subview_3[0, 0, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
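// For filter tap %arg2: %16 is the number of leading padding lanes in the 32-lane window, %18 the start and %21 the length of the valid input elements for this tap. Reset the scratch tile to zero before gathering them. | |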
memref.store %c0_i32, %alloca[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32> | |
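// Unrolled emulation of a masked 32-lane read: lane i is loaded from %subview_7 and inserted into the vector only when i < %21, the count of valid input elements; all remaining lanes keep the zero value from %cst_0. | |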
%22 = arith.cmpi sgt, %21, %c0 : index | |
%23 = scf.if %22 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c0, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %cst_0 [0] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %cst_0 : vector<32x1xi32> | |
} | |
%24 = arith.cmpi sgt, %21, %c1 : index | |
%25 = scf.if %24 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c1, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %23 [1] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %23 : vector<32x1xi32> | |
} | |
%26 = arith.cmpi sgt, %21, %c2 : index | |
%27 = scf.if %26 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c2, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %25 [2] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %25 : vector<32x1xi32> | |
} | |
%28 = arith.cmpi sgt, %21, %c3 : index | |
%29 = scf.if %28 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c3, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %27 [3] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %27 : vector<32x1xi32> | |
} | |
%30 = arith.cmpi sgt, %21, %c4 : index | |
%31 = scf.if %30 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c4, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %29 [4] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %29 : vector<32x1xi32> | |
} | |
%32 = arith.cmpi sgt, %21, %c5 : index | |
%33 = scf.if %32 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c5, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %31 [5] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %31 : vector<32x1xi32> | |
} | |
%34 = arith.cmpi sgt, %21, %c6 : index | |
%35 = scf.if %34 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c6, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %33 [6] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %33 : vector<32x1xi32> | |
} | |
%36 = arith.cmpi sgt, %21, %c7 : index | |
%37 = scf.if %36 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c7, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %35 [7] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %35 : vector<32x1xi32> | |
} | |
%38 = arith.cmpi sgt, %21, %c8 : index | |
%39 = scf.if %38 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c8, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %37 [8] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %37 : vector<32x1xi32> | |
} | |
%40 = arith.cmpi sgt, %21, %c9 : index | |
%41 = scf.if %40 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c9, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %39 [9] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %39 : vector<32x1xi32> | |
} | |
%42 = arith.cmpi sgt, %21, %c10 : index | |
%43 = scf.if %42 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c10, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %41 [10] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %41 : vector<32x1xi32> | |
} | |
%44 = arith.cmpi sgt, %21, %c11 : index | |
%45 = scf.if %44 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c11, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %43 [11] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %43 : vector<32x1xi32> | |
} | |
%46 = arith.cmpi sgt, %21, %c12 : index | |
%47 = scf.if %46 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c12, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %45 [12] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %45 : vector<32x1xi32> | |
} | |
%48 = arith.cmpi sgt, %21, %c13 : index | |
%49 = scf.if %48 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c13, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %47 [13] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %47 : vector<32x1xi32> | |
} | |
%50 = arith.cmpi sgt, %21, %c14 : index | |
%51 = scf.if %50 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c14, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %49 [14] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %49 : vector<32x1xi32> | |
} | |
%52 = arith.cmpi sgt, %21, %c15 : index | |
%53 = scf.if %52 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c15, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %51 [15] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %51 : vector<32x1xi32> | |
} | |
%54 = arith.cmpi sgt, %21, %c16 : index | |
%55 = scf.if %54 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c16, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %53 [16] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %53 : vector<32x1xi32> | |
} | |
%56 = arith.cmpi sgt, %21, %c17 : index | |
%57 = scf.if %56 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c17, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %55 [17] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %55 : vector<32x1xi32> | |
} | |
%58 = arith.cmpi sgt, %21, %c18 : index | |
%59 = scf.if %58 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c18, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %57 [18] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %57 : vector<32x1xi32> | |
} | |
%60 = arith.cmpi sgt, %21, %c19 : index | |
%61 = scf.if %60 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c19, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %59 [19] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %59 : vector<32x1xi32> | |
} | |
%62 = arith.cmpi sgt, %21, %c20 : index | |
%63 = scf.if %62 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c20, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %61 [20] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %61 : vector<32x1xi32> | |
} | |
%64 = arith.cmpi sgt, %21, %c21 : index | |
%65 = scf.if %64 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c21, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %63 [21] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %63 : vector<32x1xi32> | |
} | |
%66 = arith.cmpi sgt, %21, %c22 : index | |
%67 = scf.if %66 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c22, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %65 [22] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %65 : vector<32x1xi32> | |
} | |
%68 = arith.cmpi sgt, %21, %c23 : index | |
%69 = scf.if %68 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c23, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %67 [23] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %67 : vector<32x1xi32> | |
} | |
%70 = arith.cmpi sgt, %21, %c24 : index | |
%71 = scf.if %70 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c24, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %69 [24] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %69 : vector<32x1xi32> | |
} | |
%72 = arith.cmpi sgt, %21, %c25 : index | |
%73 = scf.if %72 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c25, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %71 [25] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %71 : vector<32x1xi32> | |
} | |
%74 = arith.cmpi sgt, %21, %c26 : index | |
%75 = scf.if %74 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c26, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %73 [26] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %73 : vector<32x1xi32> | |
} | |
%76 = arith.cmpi sgt, %21, %c27 : index | |
%77 = scf.if %76 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c27, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %75 [27] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %75 : vector<32x1xi32> | |
} | |
%78 = arith.cmpi sgt, %21, %c28 : index | |
%79 = scf.if %78 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c28, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %77 [28] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %77 : vector<32x1xi32> | |
} | |
%80 = arith.cmpi sgt, %21, %c29 : index | |
%81 = scf.if %80 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c29, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %79 [29] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %79 : vector<32x1xi32> | |
} | |
%82 = arith.cmpi sgt, %21, %c30 : index | |
%83 = scf.if %82 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c30, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %81 [30] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %81 : vector<32x1xi32> | |
} | |
%84 = arith.cmpi sgt, %21, %c31 : index | |
%85 = scf.if %84 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c31, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %83 [31] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %83 : vector<32x1xi32> | |
} | |
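// Scatter the gathered lanes into %alloca starting at lane %16, dropping anything that would land past lane 31: the matching unrolled emulation of a masked write that places the valid data after the leading zeros. | |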
%86 = arith.cmpi slt, %16, %c32 : index | |
scf.if %86 { | |
%155 = vector.extract %85[0, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %16, %c0] : memref<1x1x32x1xi32> | |
} | |
%87 = affine.apply affine_map<(d0) -> (d0 + 1)>(%16) | |
%88 = arith.cmpi slt, %87, %c32 : index | |
scf.if %88 { | |
%155 = vector.extract %85[1, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %87, %c0] : memref<1x1x32x1xi32> | |
} | |
%89 = affine.apply affine_map<(d0) -> (d0 + 2)>(%16) | |
%90 = arith.cmpi slt, %89, %c32 : index | |
scf.if %90 { | |
%155 = vector.extract %85[2, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %89, %c0] : memref<1x1x32x1xi32> | |
} | |
%91 = affine.apply affine_map<(d0) -> (d0 + 3)>(%16) | |
%92 = arith.cmpi slt, %91, %c32 : index | |
scf.if %92 { | |
%155 = vector.extract %85[3, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %91, %c0] : memref<1x1x32x1xi32> | |
} | |
%93 = affine.apply affine_map<(d0) -> (d0 + 4)>(%16) | |
%94 = arith.cmpi slt, %93, %c32 : index | |
scf.if %94 { | |
%155 = vector.extract %85[4, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %93, %c0] : memref<1x1x32x1xi32> | |
} | |
%95 = affine.apply affine_map<(d0) -> (d0 + 5)>(%16) | |
%96 = arith.cmpi slt, %95, %c32 : index | |
scf.if %96 { | |
%155 = vector.extract %85[5, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %95, %c0] : memref<1x1x32x1xi32> | |
} | |
%97 = affine.apply affine_map<(d0) -> (d0 + 6)>(%16) | |
%98 = arith.cmpi slt, %97, %c32 : index | |
scf.if %98 { | |
%155 = vector.extract %85[6, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %97, %c0] : memref<1x1x32x1xi32> | |
} | |
%99 = affine.apply affine_map<(d0) -> (d0 + 7)>(%16) | |
%100 = arith.cmpi slt, %99, %c32 : index | |
scf.if %100 { | |
%155 = vector.extract %85[7, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %99, %c0] : memref<1x1x32x1xi32> | |
} | |
%101 = affine.apply affine_map<(d0) -> (d0 + 8)>(%16) | |
%102 = arith.cmpi slt, %101, %c32 : index | |
scf.if %102 { | |
%155 = vector.extract %85[8, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %101, %c0] : memref<1x1x32x1xi32> | |
} | |
%103 = affine.apply affine_map<(d0) -> (d0 + 9)>(%16) | |
%104 = arith.cmpi slt, %103, %c32 : index | |
scf.if %104 { | |
%155 = vector.extract %85[9, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %103, %c0] : memref<1x1x32x1xi32> | |
} | |
%105 = affine.apply affine_map<(d0) -> (d0 + 10)>(%16) | |
%106 = arith.cmpi slt, %105, %c32 : index | |
scf.if %106 { | |
%155 = vector.extract %85[10, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %105, %c0] : memref<1x1x32x1xi32> | |
} | |
%107 = affine.apply affine_map<(d0) -> (d0 + 11)>(%16) | |
%108 = arith.cmpi slt, %107, %c32 : index | |
scf.if %108 { | |
%155 = vector.extract %85[11, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %107, %c0] : memref<1x1x32x1xi32> | |
} | |
%109 = affine.apply affine_map<(d0) -> (d0 + 12)>(%16) | |
%110 = arith.cmpi slt, %109, %c32 : index | |
scf.if %110 { | |
%155 = vector.extract %85[12, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %109, %c0] : memref<1x1x32x1xi32> | |
} | |
%111 = affine.apply affine_map<(d0) -> (d0 + 13)>(%16) | |
%112 = arith.cmpi slt, %111, %c32 : index | |
scf.if %112 { | |
%155 = vector.extract %85[13, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %111, %c0] : memref<1x1x32x1xi32> | |
} | |
%113 = affine.apply affine_map<(d0) -> (d0 + 14)>(%16) | |
%114 = arith.cmpi slt, %113, %c32 : index | |
scf.if %114 { | |
%155 = vector.extract %85[14, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %113, %c0] : memref<1x1x32x1xi32> | |
} | |
%115 = affine.apply affine_map<(d0) -> (d0 + 15)>(%16) | |
%116 = arith.cmpi slt, %115, %c32 : index | |
scf.if %116 { | |
%155 = vector.extract %85[15, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %115, %c0] : memref<1x1x32x1xi32> | |
} | |
%117 = affine.apply affine_map<(d0) -> (d0 + 16)>(%16) | |
%118 = arith.cmpi slt, %117, %c32 : index | |
scf.if %118 { | |
%155 = vector.extract %85[16, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %117, %c0] : memref<1x1x32x1xi32> | |
} | |
%119 = affine.apply affine_map<(d0) -> (d0 + 17)>(%16) | |
%120 = arith.cmpi slt, %119, %c32 : index | |
scf.if %120 { | |
%155 = vector.extract %85[17, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %119, %c0] : memref<1x1x32x1xi32> | |
} | |
%121 = affine.apply affine_map<(d0) -> (d0 + 18)>(%16) | |
%122 = arith.cmpi slt, %121, %c32 : index | |
scf.if %122 { | |
%155 = vector.extract %85[18, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %121, %c0] : memref<1x1x32x1xi32> | |
} | |
%123 = affine.apply affine_map<(d0) -> (d0 + 19)>(%16) | |
%124 = arith.cmpi slt, %123, %c32 : index | |
scf.if %124 { | |
%155 = vector.extract %85[19, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %123, %c0] : memref<1x1x32x1xi32> | |
} | |
%125 = affine.apply affine_map<(d0) -> (d0 + 20)>(%16) | |
%126 = arith.cmpi slt, %125, %c32 : index | |
scf.if %126 { | |
%155 = vector.extract %85[20, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %125, %c0] : memref<1x1x32x1xi32> | |
} | |
%127 = affine.apply affine_map<(d0) -> (d0 + 21)>(%16) | |
%128 = arith.cmpi slt, %127, %c32 : index | |
scf.if %128 { | |
%155 = vector.extract %85[21, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %127, %c0] : memref<1x1x32x1xi32> | |
} | |
%129 = affine.apply affine_map<(d0) -> (d0 + 22)>(%16) | |
%130 = arith.cmpi slt, %129, %c32 : index | |
scf.if %130 { | |
%155 = vector.extract %85[22, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %129, %c0] : memref<1x1x32x1xi32> | |
} | |
%131 = affine.apply affine_map<(d0) -> (d0 + 23)>(%16) | |
%132 = arith.cmpi slt, %131, %c32 : index | |
scf.if %132 { | |
%155 = vector.extract %85[23, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %131, %c0] : memref<1x1x32x1xi32> | |
} | |
%133 = affine.apply affine_map<(d0) -> (d0 + 24)>(%16) | |
%134 = arith.cmpi slt, %133, %c32 : index | |
scf.if %134 { | |
%155 = vector.extract %85[24, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %133, %c0] : memref<1x1x32x1xi32> | |
} | |
%135 = affine.apply affine_map<(d0) -> (d0 + 25)>(%16) | |
%136 = arith.cmpi slt, %135, %c32 : index | |
scf.if %136 { | |
%155 = vector.extract %85[25, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %135, %c0] : memref<1x1x32x1xi32> | |
} | |
%137 = affine.apply affine_map<(d0) -> (d0 + 26)>(%16) | |
%138 = arith.cmpi slt, %137, %c32 : index | |
scf.if %138 { | |
%155 = vector.extract %85[26, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %137, %c0] : memref<1x1x32x1xi32> | |
} | |
%139 = affine.apply affine_map<(d0) -> (d0 + 27)>(%16) | |
%140 = arith.cmpi slt, %139, %c32 : index | |
scf.if %140 { | |
%155 = vector.extract %85[27, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %139, %c0] : memref<1x1x32x1xi32> | |
} | |
%141 = affine.apply affine_map<(d0) -> (d0 + 28)>(%16) | |
%142 = arith.cmpi slt, %141, %c32 : index | |
scf.if %142 { | |
%155 = vector.extract %85[28, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %141, %c0] : memref<1x1x32x1xi32> | |
} | |
%143 = affine.apply affine_map<(d0) -> (d0 + 29)>(%16) | |
%144 = arith.cmpi slt, %143, %c32 : index | |
scf.if %144 { | |
%155 = vector.extract %85[29, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %143, %c0] : memref<1x1x32x1xi32> | |
} | |
%145 = affine.apply affine_map<(d0) -> (d0 + 30)>(%16) | |
%146 = arith.cmpi slt, %145, %c32 : index | |
scf.if %146 { | |
%155 = vector.extract %85[30, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %145, %c0] : memref<1x1x32x1xi32> | |
} | |
%147 = affine.apply affine_map<(d0) -> (d0 + 31)>(%16) | |
%148 = arith.cmpi slt, %147, %c32 : index | |
scf.if %148 { | |
%155 = vector.extract %85[31, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %147, %c0] : memref<1x1x32x1xi32> | |
} | |
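// One tap of the depthwise convolution: load the 32 padded input lanes, broadcast filter weight %cst[0, %arg2, 0] across the vector, multiply, and accumulate into the 32-lane output tile in place. | |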
%149 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32> | |
%extracted = tensor.extract %cst[%c0, %arg2, %c0] : tensor<1x43x1xi32> | |
%150 = vector.broadcast %extracted : i32 to vector<1xi32> | |
%151 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
%152 = vector.shuffle %150, %150 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%153 = arith.muli %149, %152 : vector<32xi32> | |
%154 = arith.addi %153, %151 : vector<32xi32> | |
vector.store %154, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
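// Same dispatch after common-subexpression elimination; the IR shown here appears unchanged relative to the canonicalized form above. | |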
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c0_i32 = arith.constant 0 : i32 | |
%c2 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%c4 = arith.constant 4 : index | |
%c5 = arith.constant 5 : index | |
%c6 = arith.constant 6 : index | |
%c7 = arith.constant 7 : index | |
%c8 = arith.constant 8 : index | |
%c9 = arith.constant 9 : index | |
%c10 = arith.constant 10 : index | |
%c11 = arith.constant 11 : index | |
%c12 = arith.constant 12 : index | |
%c13 = arith.constant 13 : index | |
%c14 = arith.constant 14 : index | |
%c15 = arith.constant 15 : index | |
%c16 = arith.constant 16 : index | |
%c17 = arith.constant 17 : index | |
%c18 = arith.constant 18 : index | |
%c19 = arith.constant 19 : index | |
%c20 = arith.constant 20 : index | |
%c21 = arith.constant 21 : index | |
%c22 = arith.constant 22 : index | |
%c23 = arith.constant 23 : index | |
%c24 = arith.constant 24 : index | |
%c25 = arith.constant 25 : index | |
%c26 = arith.constant 26 : index | |
%c27 = arith.constant 27 : index | |
%c28 = arith.constant 28 : index | |
%c29 = arith.constant 29 : index | |
%c30 = arith.constant 30 : index | |
%c31 = arith.constant 31 : index | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c64 = arith.constant 64 : index | |
%c60 = arith.constant 60 : index | |
%c32 = arith.constant 32 : index | |
%c43 = arith.constant 43 : index | |
%cst_0 = arith.constant dense<0> : vector<32x1xi32> | |
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32> | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %0, 64 : memref<1x1080x1920x1xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%subview = memref.subview %1[0, %2, %3, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%4 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x] | |
%5 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x] | |
%6 = affine.min affine_map<()[s0] -> (1920, s0)>()[%5] | |
%7 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x] | |
%8 = affine.min affine_map<()[s0] -> (1920, s0)>()[%7] | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%8, %6] | |
%subview_1 = memref.subview %0[0, %2, %6, 0] [1, 60, %9, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>> | |
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>> | |
scf.for %arg0 = %c0 to %c60 step %c1 { | |
scf.for %arg1 = %c0 to %c64 step %c32 { | |
%10 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%4] | |
%11 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%4] | |
%12 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%11)[%8, %6] | |
%13 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%4] | |
%14 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%13)[%8, %6] | |
%15 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%14, %12) | |
%subview_3 = memref.subview %subview_1[0, %arg0, %12, 0] [1, 1, %15, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> | |
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>> | |
scf.for %arg2 = %c0 to %c43 step %c1 { | |
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%10, %arg2) | |
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %10) | |
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %14, %12) | |
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %10) | |
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %14, %12) | |
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18) | |
%subview_7 = memref.subview %subview_3[0, 0, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32> | |
%22 = arith.cmpi sgt, %21, %c0 : index | |
%23 = scf.if %22 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c0, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %cst_0 [0] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %cst_0 : vector<32x1xi32> | |
} | |
%24 = arith.cmpi sgt, %21, %c1 : index | |
%25 = scf.if %24 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c1, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %23 [1] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %23 : vector<32x1xi32> | |
} | |
%26 = arith.cmpi sgt, %21, %c2 : index | |
%27 = scf.if %26 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c2, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %25 [2] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %25 : vector<32x1xi32> | |
} | |
%28 = arith.cmpi sgt, %21, %c3 : index | |
%29 = scf.if %28 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c3, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %27 [3] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %27 : vector<32x1xi32> | |
} | |
%30 = arith.cmpi sgt, %21, %c4 : index | |
%31 = scf.if %30 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c4, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %29 [4] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %29 : vector<32x1xi32> | |
} | |
%32 = arith.cmpi sgt, %21, %c5 : index | |
%33 = scf.if %32 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c5, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %31 [5] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %31 : vector<32x1xi32> | |
} | |
%34 = arith.cmpi sgt, %21, %c6 : index | |
%35 = scf.if %34 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c6, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %33 [6] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %33 : vector<32x1xi32> | |
} | |
%36 = arith.cmpi sgt, %21, %c7 : index | |
%37 = scf.if %36 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c7, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %35 [7] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %35 : vector<32x1xi32> | |
} | |
%38 = arith.cmpi sgt, %21, %c8 : index | |
%39 = scf.if %38 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c8, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %37 [8] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %37 : vector<32x1xi32> | |
} | |
%40 = arith.cmpi sgt, %21, %c9 : index | |
%41 = scf.if %40 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c9, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %39 [9] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %39 : vector<32x1xi32> | |
} | |
%42 = arith.cmpi sgt, %21, %c10 : index | |
%43 = scf.if %42 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c10, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %41 [10] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %41 : vector<32x1xi32> | |
} | |
%44 = arith.cmpi sgt, %21, %c11 : index | |
%45 = scf.if %44 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c11, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %43 [11] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %43 : vector<32x1xi32> | |
} | |
%46 = arith.cmpi sgt, %21, %c12 : index | |
%47 = scf.if %46 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c12, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %45 [12] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %45 : vector<32x1xi32> | |
} | |
%48 = arith.cmpi sgt, %21, %c13 : index | |
%49 = scf.if %48 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c13, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %47 [13] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %47 : vector<32x1xi32> | |
} | |
%50 = arith.cmpi sgt, %21, %c14 : index | |
%51 = scf.if %50 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c14, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %49 [14] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %49 : vector<32x1xi32> | |
} | |
%52 = arith.cmpi sgt, %21, %c15 : index | |
%53 = scf.if %52 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c15, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %51 [15] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %51 : vector<32x1xi32> | |
} | |
%54 = arith.cmpi sgt, %21, %c16 : index | |
%55 = scf.if %54 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c16, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %53 [16] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %53 : vector<32x1xi32> | |
} | |
%56 = arith.cmpi sgt, %21, %c17 : index | |
%57 = scf.if %56 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c17, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %55 [17] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %55 : vector<32x1xi32> | |
} | |
%58 = arith.cmpi sgt, %21, %c18 : index | |
%59 = scf.if %58 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c18, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %57 [18] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %57 : vector<32x1xi32> | |
} | |
%60 = arith.cmpi sgt, %21, %c19 : index | |
%61 = scf.if %60 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c19, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %59 [19] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %59 : vector<32x1xi32> | |
} | |
%62 = arith.cmpi sgt, %21, %c20 : index | |
%63 = scf.if %62 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c20, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %61 [20] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %61 : vector<32x1xi32> | |
} | |
%64 = arith.cmpi sgt, %21, %c21 : index | |
%65 = scf.if %64 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c21, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %63 [21] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %63 : vector<32x1xi32> | |
} | |
%66 = arith.cmpi sgt, %21, %c22 : index | |
%67 = scf.if %66 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c22, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %65 [22] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %65 : vector<32x1xi32> | |
} | |
%68 = arith.cmpi sgt, %21, %c23 : index | |
%69 = scf.if %68 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c23, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %67 [23] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %67 : vector<32x1xi32> | |
} | |
%70 = arith.cmpi sgt, %21, %c24 : index | |
%71 = scf.if %70 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c24, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %69 [24] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %69 : vector<32x1xi32> | |
} | |
%72 = arith.cmpi sgt, %21, %c25 : index | |
%73 = scf.if %72 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c25, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %71 [25] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %71 : vector<32x1xi32> | |
} | |
%74 = arith.cmpi sgt, %21, %c26 : index | |
%75 = scf.if %74 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c26, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %73 [26] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %73 : vector<32x1xi32> | |
} | |
%76 = arith.cmpi sgt, %21, %c27 : index | |
%77 = scf.if %76 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c27, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %75 [27] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %75 : vector<32x1xi32> | |
} | |
%78 = arith.cmpi sgt, %21, %c28 : index | |
%79 = scf.if %78 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c28, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %77 [28] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %77 : vector<32x1xi32> | |
} | |
%80 = arith.cmpi sgt, %21, %c29 : index | |
%81 = scf.if %80 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c29, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %79 [29] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %79 : vector<32x1xi32> | |
} | |
%82 = arith.cmpi sgt, %21, %c30 : index | |
%83 = scf.if %82 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c30, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %81 [30] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %81 : vector<32x1xi32> | |
} | |
%84 = arith.cmpi sgt, %21, %c31 : index | |
%85 = scf.if %84 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c31, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %83 [31] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %83 : vector<32x1xi32> | |
} | |
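// The scf.if ladder above is the scalarized form of a single masked vector
// read: lane i of the vector<32x1xi32> accumulator is loaded from %subview_7
// only while i < %21, the dynamic in-bounds extent of the padded input slice;
// out-of-bounds lanes keep their zero initialization. A minimal sketch of the
// op this unrolls, assuming a masked transfer (%mask and the exact
// pre-lowering form are illustrative, not taken from this dump):
//   %mask = vector.create_mask %21, %c1 : vector<32x1xi1>
//   %read = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %c0_i32, %mask
//             : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>>, vector<32x1xi32>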
%86 = arith.cmpi slt, %16, %c32 : index | |
scf.if %86 { | |
%155 = vector.extract %85[0, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %16, %c0] : memref<1x1x32x1xi32> | |
} | |
%87 = affine.apply affine_map<(d0) -> (d0 + 1)>(%16) | |
%88 = arith.cmpi slt, %87, %c32 : index | |
scf.if %88 { | |
%155 = vector.extract %85[1, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %87, %c0] : memref<1x1x32x1xi32> | |
} | |
%89 = affine.apply affine_map<(d0) -> (d0 + 2)>(%16) | |
%90 = arith.cmpi slt, %89, %c32 : index | |
scf.if %90 { | |
%155 = vector.extract %85[2, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %89, %c0] : memref<1x1x32x1xi32> | |
} | |
%91 = affine.apply affine_map<(d0) -> (d0 + 3)>(%16) | |
%92 = arith.cmpi slt, %91, %c32 : index | |
scf.if %92 { | |
%155 = vector.extract %85[3, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %91, %c0] : memref<1x1x32x1xi32> | |
} | |
%93 = affine.apply affine_map<(d0) -> (d0 + 4)>(%16) | |
%94 = arith.cmpi slt, %93, %c32 : index | |
scf.if %94 { | |
%155 = vector.extract %85[4, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %93, %c0] : memref<1x1x32x1xi32> | |
} | |
%95 = affine.apply affine_map<(d0) -> (d0 + 5)>(%16) | |
%96 = arith.cmpi slt, %95, %c32 : index | |
scf.if %96 { | |
%155 = vector.extract %85[5, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %95, %c0] : memref<1x1x32x1xi32> | |
} | |
%97 = affine.apply affine_map<(d0) -> (d0 + 6)>(%16) | |
%98 = arith.cmpi slt, %97, %c32 : index | |
scf.if %98 { | |
%155 = vector.extract %85[6, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %97, %c0] : memref<1x1x32x1xi32> | |
} | |
%99 = affine.apply affine_map<(d0) -> (d0 + 7)>(%16) | |
%100 = arith.cmpi slt, %99, %c32 : index | |
scf.if %100 { | |
%155 = vector.extract %85[7, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %99, %c0] : memref<1x1x32x1xi32> | |
} | |
%101 = affine.apply affine_map<(d0) -> (d0 + 8)>(%16) | |
%102 = arith.cmpi slt, %101, %c32 : index | |
scf.if %102 { | |
%155 = vector.extract %85[8, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %101, %c0] : memref<1x1x32x1xi32> | |
} | |
%103 = affine.apply affine_map<(d0) -> (d0 + 9)>(%16) | |
%104 = arith.cmpi slt, %103, %c32 : index | |
scf.if %104 { | |
%155 = vector.extract %85[9, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %103, %c0] : memref<1x1x32x1xi32> | |
} | |
%105 = affine.apply affine_map<(d0) -> (d0 + 10)>(%16) | |
%106 = arith.cmpi slt, %105, %c32 : index | |
scf.if %106 { | |
%155 = vector.extract %85[10, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %105, %c0] : memref<1x1x32x1xi32> | |
} | |
%107 = affine.apply affine_map<(d0) -> (d0 + 11)>(%16) | |
%108 = arith.cmpi slt, %107, %c32 : index | |
scf.if %108 { | |
%155 = vector.extract %85[11, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %107, %c0] : memref<1x1x32x1xi32> | |
} | |
%109 = affine.apply affine_map<(d0) -> (d0 + 12)>(%16) | |
%110 = arith.cmpi slt, %109, %c32 : index | |
scf.if %110 { | |
%155 = vector.extract %85[12, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %109, %c0] : memref<1x1x32x1xi32> | |
} | |
%111 = affine.apply affine_map<(d0) -> (d0 + 13)>(%16) | |
%112 = arith.cmpi slt, %111, %c32 : index | |
scf.if %112 { | |
%155 = vector.extract %85[13, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %111, %c0] : memref<1x1x32x1xi32> | |
} | |
%113 = affine.apply affine_map<(d0) -> (d0 + 14)>(%16) | |
%114 = arith.cmpi slt, %113, %c32 : index | |
scf.if %114 { | |
%155 = vector.extract %85[14, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %113, %c0] : memref<1x1x32x1xi32> | |
} | |
%115 = affine.apply affine_map<(d0) -> (d0 + 15)>(%16) | |
%116 = arith.cmpi slt, %115, %c32 : index | |
scf.if %116 { | |
%155 = vector.extract %85[15, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %115, %c0] : memref<1x1x32x1xi32> | |
} | |
%117 = affine.apply affine_map<(d0) -> (d0 + 16)>(%16) | |
%118 = arith.cmpi slt, %117, %c32 : index | |
scf.if %118 { | |
%155 = vector.extract %85[16, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %117, %c0] : memref<1x1x32x1xi32> | |
} | |
%119 = affine.apply affine_map<(d0) -> (d0 + 17)>(%16) | |
%120 = arith.cmpi slt, %119, %c32 : index | |
scf.if %120 { | |
%155 = vector.extract %85[17, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %119, %c0] : memref<1x1x32x1xi32> | |
} | |
%121 = affine.apply affine_map<(d0) -> (d0 + 18)>(%16) | |
%122 = arith.cmpi slt, %121, %c32 : index | |
scf.if %122 { | |
%155 = vector.extract %85[18, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %121, %c0] : memref<1x1x32x1xi32> | |
} | |
%123 = affine.apply affine_map<(d0) -> (d0 + 19)>(%16) | |
%124 = arith.cmpi slt, %123, %c32 : index | |
scf.if %124 { | |
%155 = vector.extract %85[19, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %123, %c0] : memref<1x1x32x1xi32> | |
} | |
%125 = affine.apply affine_map<(d0) -> (d0 + 20)>(%16) | |
%126 = arith.cmpi slt, %125, %c32 : index | |
scf.if %126 { | |
%155 = vector.extract %85[20, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %125, %c0] : memref<1x1x32x1xi32> | |
} | |
%127 = affine.apply affine_map<(d0) -> (d0 + 21)>(%16) | |
%128 = arith.cmpi slt, %127, %c32 : index | |
scf.if %128 { | |
%155 = vector.extract %85[21, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %127, %c0] : memref<1x1x32x1xi32> | |
} | |
%129 = affine.apply affine_map<(d0) -> (d0 + 22)>(%16) | |
%130 = arith.cmpi slt, %129, %c32 : index | |
scf.if %130 { | |
%155 = vector.extract %85[22, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %129, %c0] : memref<1x1x32x1xi32> | |
} | |
%131 = affine.apply affine_map<(d0) -> (d0 + 23)>(%16) | |
%132 = arith.cmpi slt, %131, %c32 : index | |
scf.if %132 { | |
%155 = vector.extract %85[23, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %131, %c0] : memref<1x1x32x1xi32> | |
} | |
%133 = affine.apply affine_map<(d0) -> (d0 + 24)>(%16) | |
%134 = arith.cmpi slt, %133, %c32 : index | |
scf.if %134 { | |
%155 = vector.extract %85[24, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %133, %c0] : memref<1x1x32x1xi32> | |
} | |
%135 = affine.apply affine_map<(d0) -> (d0 + 25)>(%16) | |
%136 = arith.cmpi slt, %135, %c32 : index | |
scf.if %136 { | |
%155 = vector.extract %85[25, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %135, %c0] : memref<1x1x32x1xi32> | |
} | |
%137 = affine.apply affine_map<(d0) -> (d0 + 26)>(%16) | |
%138 = arith.cmpi slt, %137, %c32 : index | |
scf.if %138 { | |
%155 = vector.extract %85[26, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %137, %c0] : memref<1x1x32x1xi32> | |
} | |
%139 = affine.apply affine_map<(d0) -> (d0 + 27)>(%16) | |
%140 = arith.cmpi slt, %139, %c32 : index | |
scf.if %140 { | |
%155 = vector.extract %85[27, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %139, %c0] : memref<1x1x32x1xi32> | |
} | |
%141 = affine.apply affine_map<(d0) -> (d0 + 28)>(%16) | |
%142 = arith.cmpi slt, %141, %c32 : index | |
scf.if %142 { | |
%155 = vector.extract %85[28, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %141, %c0] : memref<1x1x32x1xi32> | |
} | |
%143 = affine.apply affine_map<(d0) -> (d0 + 29)>(%16) | |
%144 = arith.cmpi slt, %143, %c32 : index | |
scf.if %144 { | |
%155 = vector.extract %85[29, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %143, %c0] : memref<1x1x32x1xi32> | |
} | |
%145 = affine.apply affine_map<(d0) -> (d0 + 30)>(%16) | |
%146 = arith.cmpi slt, %145, %c32 : index | |
scf.if %146 { | |
%155 = vector.extract %85[30, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %145, %c0] : memref<1x1x32x1xi32> | |
} | |
%147 = affine.apply affine_map<(d0) -> (d0 + 31)>(%16) | |
%148 = arith.cmpi slt, %147, %c32 : index | |
scf.if %148 { | |
%155 = vector.extract %85[31, 0] : vector<32x1xi32> | |
memref.store %155, %alloca[%c0, %c0, %147, %c0] : memref<1x1x32x1xi32> | |
} | |
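// Symmetrically, the guarded stores above scatter the lanes of %85 into the
// 32-element staging buffer %alloca: lane i is written at offset %16 + i only
// while that offset stays below 32. This appears to be the scalarized form of
// a masked write at a dynamic offset, sketched here with illustrative names
// (%len and %wmask are not in the dump):
//   %len = affine.apply affine_map<(d0) -> (-d0 + 32)>(%16)
//   %wmask = vector.create_mask %len, %c1 : vector<32x1xi1>
//   vector.transfer_write %85, %alloca[%c0, %c0, %16, %c0], %wmask
//             : vector<32x1xi32>, memref<1x1x32x1xi32>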
%149 = vector.load %collapse_shape[%c0, %c0] : memref<1x32xi32, strided<[32, 1]>>, vector<32xi32> | |
%extracted = tensor.extract %cst[%c0, %arg2, %c0] : tensor<1x43x1xi32> | |
%150 = vector.broadcast %extracted : i32 to vector<1xi32> | |
%151 = vector.load %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
%152 = vector.shuffle %150, %150 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<1xi32>, vector<1xi32> | |
%153 = arith.muli %149, %152 : vector<32xi32> | |
%154 = arith.addi %153, %151 : vector<32xi32> | |
vector.store %154, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32> | |
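// Inner-loop body of the depthwise convolution, one filter tap per iteration:
// %149 is the staged 32-wide input row, %extracted/%152 broadcast the tap
// weight %cst[0, %arg2, 0] across all 32 lanes, and %153/%154 fold it into the
// running output row. Per output lane x this computes, in i32,
//   out[x] += in[x + %arg2 - 21] * w[%arg2]
// where the "- 21" reflects the implicit zero padding handled by the masked
// read/write above (this indexing form is a reading of the affine maps, not
// literal IR).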
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After LinalgStrategyLowerVectorsPass (iree-linalg-strategy-lower-vectors-pass) //----- // | |
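// The function below appears essentially identical to the tail of the
// previous dump, apart from the hoisted index constants: the masked transfers
// were already unrolled into scf.if ladders, so this vector-lowering step
// seems to have had little left to rewrite here.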
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43() { | |
%c0_i32 = arith.constant 0 : i32 | |
%c2 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%c4 = arith.constant 4 : index | |
%c5 = arith.constant 5 : index | |
%c6 = arith.constant 6 : index | |
%c7 = arith.constant 7 : index | |
%c8 = arith.constant 8 : index | |
%c9 = arith.constant 9 : index | |
%c10 = arith.constant 10 : index | |
%c11 = arith.constant 11 : index | |
%c12 = arith.constant 12 : index | |
%c13 = arith.constant 13 : index | |
%c14 = arith.constant 14 : index | |
%c15 = arith.constant 15 : index | |
%c16 = arith.constant 16 : index | |
%c17 = arith.constant 17 : index | |
%c18 = arith.constant 18 : index | |
%c19 = arith.constant 19 : index | |
%c20 = arith.constant 20 : index | |
%c21 = arith.constant 21 : index | |
%c22 = arith.constant 22 : index | |
%c23 = arith.constant 23 : index | |
%c24 = arith.constant 24 : index | |
%c25 = arith.constant 25 : index | |
%c26 = arith.constant 26 : index | |
%c27 = arith.constant 27 : index | |
%c28 = arith.constant 28 : index | |
%c29 = arith.constant 29 : index | |
%c30 = arith.constant 30 : index | |
%c31 = arith.constant 31 : index | |
%cst = arith.constant dense<[[[157], [206], [268], [344], [436], [545], [673], [818], [982], [1163], [1358], [1566], [1780], [1997], [2210], [2413], [2600], [2763], [2897], [2996], [3057], [3078], [3057], [2996], [2897], [2763], [2600], [2413], [2210], [1997], [1780], [1566], [1358], [1163], [982], [818], [673], [545], [436], [344], [268], [206], [157]]]> : tensor<1x43x1xi32> | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c64 = arith.constant 64 : index | |
%c60 = arith.constant 60 : index | |
%c32 = arith.constant 32 : index | |
%c43 = arith.constant 43 : index | |
%cst_0 = arith.constant dense<0> : vector<32x1xi32> | |
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x32x1xi32> | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %0, 64 : memref<1x1080x1920x1xi32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1080x1920x1xi32> | |
memref.assume_alignment %1, 64 : memref<1x1080x1920x1xi32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%2 = affine.apply affine_map<()[s0] -> (s0 * 60)>()[%workgroup_id_y] | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] | |
%subview = memref.subview %1[0, %2, %3, 0] [1, 60, 64, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%4 = affine.max affine_map<()[s0] -> (s0 * -64 + 21, 0)>()[%workgroup_id_x] | |
%5 = affine.max affine_map<()[s0] -> (0, s0 * 64 - 21)>()[%workgroup_id_x] | |
%6 = affine.min affine_map<()[s0] -> (1920, s0)>()[%5] | |
%7 = affine.max affine_map<()[s0] -> (0, s0 * 64 + 85)>()[%workgroup_id_x] | |
%8 = affine.min affine_map<()[s0] -> (1920, s0)>()[%7] | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 - s1)>()[%8, %6] | |
%subview_1 = memref.subview %0[0, %2, %6, 0] [1, 60, %9, 1] [1, 1, 1, 1] : memref<1x1080x1920x1xi32> to memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32> to memref<1x32x1xi32, strided<[32, 1, 1]>> | |
%collapse_shape = memref.collapse_shape %subview_2 [[0], [1, 2]] : memref<1x32x1xi32, strided<[32, 1, 1]>> into memref<1x32xi32, strided<[32, 1]>> | |
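// Setup recap: %0/%1 are the input and output buffers, and the affine
// max/min chain %4..%9 clamps this workgroup's input window to the real
// extent [0, 1920). For 64 outputs and a 43-tap filter with 21 columns of
// implicit zero padding on each side, the window spans 64 + 43 - 1 = 106
// columns starting at 64 * workgroup_id_x - 21; %6/%8 are its clamped bounds,
// %9 its valid width, and %4 the left padding still to be synthesized as
// zeros.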
scf.for %arg0 = %c0 to %c60 step %c1 { | |
scf.for %arg1 = %c0 to %c64 step %c32 { | |
%10 = affine.max affine_map<(d0)[s0] -> (-d0 + s0, 0)>(%arg1)[%4] | |
%11 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%arg1)[%4] | |
%12 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%11)[%8, %6] | |
%13 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0 + 74)>(%arg1)[%4] | |
%14 = affine.min affine_map<(d0)[s0, s1] -> (s0 - s1, d0)>(%13)[%8, %6] | |
%15 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%14, %12) | |
%subview_3 = memref.subview %subview_1[0, %arg0, %12, 0] [1, 1, %15, 1] [1, 1, 1, 1] : memref<1x60x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_4 = memref.subview %subview[0, %arg0, %arg1, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x60x64x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
memref.store %c0_i32, %subview_4[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%subview_5 = memref.subview %subview_4[0, 0, 0, 0] [1, 1, 32, 1] [1, 1, 1, 1] : memref<1x1x32x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> | |
%collapse_shape_6 = memref.collapse_shape %subview_5 [[0], [1, 2]] : memref<1x32x1xi32, strided<[2073600, 1, 1], offset: ?>> into memref<1x32xi32, strided<[2073600, 1], offset: ?>> | |
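// The 32 scalar stores above zero-fill the current output row %subview_4
// before any taps are accumulated. Since the row is contiguous along the
// vectorized dim, this is equivalent to one vector store of a zero splat,
// sketched with an illustrative %zero:
//   %zero = arith.constant dense<0> : vector<32xi32>
//   vector.store %zero, %collapse_shape_6[%c0, %c0] : memref<1x32xi32, strided<[2073600, 1], offset: ?>>, vector<32xi32>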
scf.for %arg2 = %c0 to %c43 step %c1 { | |
%16 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%10, %arg2) | |
%17 = affine.max affine_map<(d0, d1) -> (0, d0 - d1)>(%arg2, %10) | |
%18 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%17, %14, %12) | |
%19 = affine.max affine_map<(d0, d1) -> (0, d0 - d1 + 32)>(%arg2, %10) | |
%20 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%19, %14, %12) | |
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%20, %18) | |
%subview_7 = memref.subview %subview_3[0, 0, %18, 0] [1, 1, %21, 1] [1, 1, 1, 1] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> to memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
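// Per-tap bounds: for filter tap %arg2, %16 counts the leading output lanes
// that still fall in the left zero-padding, %18/%20 are the clamped start/end
// of the overlap between the 32-output window and the valid input slice, and
// %21 = %20 - %18 is the in-bounds length that gates every scf.if below.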
memref.store %c0_i32, %alloca[%c0, %c0, %c0, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c1, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c2, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c3, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c4, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c5, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c6, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c7, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c8, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c9, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c10, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c11, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c12, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c13, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c14, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c15, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c16, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c17, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c18, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c19, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c20, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c21, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c22, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c23, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c24, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c25, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c26, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c27, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c28, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c29, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c30, %c0] : memref<1x1x32x1xi32> | |
memref.store %c0_i32, %alloca[%c0, %c0, %c31, %c0] : memref<1x1x32x1xi32> | |
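// %alloca is reset to zero each tap so that masked-out lanes of the read
// below contribute the padding value. The scf.if ladder that follows repeats
// the guarded-read pattern annotated in the previous dump.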
%22 = arith.cmpi sgt, %21, %c0 : index | |
%23 = scf.if %22 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c0, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %cst_0 [0] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %cst_0 : vector<32x1xi32> | |
} | |
%24 = arith.cmpi sgt, %21, %c1 : index | |
%25 = scf.if %24 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c1, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %23 [1] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %23 : vector<32x1xi32> | |
} | |
%26 = arith.cmpi sgt, %21, %c2 : index | |
%27 = scf.if %26 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c2, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %25 [2] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %25 : vector<32x1xi32> | |
} | |
%28 = arith.cmpi sgt, %21, %c3 : index | |
%29 = scf.if %28 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c3, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %27 [3] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %27 : vector<32x1xi32> | |
} | |
%30 = arith.cmpi sgt, %21, %c4 : index | |
%31 = scf.if %30 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c4, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %29 [4] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %29 : vector<32x1xi32> | |
} | |
%32 = arith.cmpi sgt, %21, %c5 : index | |
%33 = scf.if %32 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c5, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %31 [5] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %31 : vector<32x1xi32> | |
} | |
%34 = arith.cmpi sgt, %21, %c6 : index | |
%35 = scf.if %34 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c6, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %33 [6] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %33 : vector<32x1xi32> | |
} | |
%36 = arith.cmpi sgt, %21, %c7 : index | |
%37 = scf.if %36 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c7, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %35 [7] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %35 : vector<32x1xi32> | |
} | |
%38 = arith.cmpi sgt, %21, %c8 : index | |
%39 = scf.if %38 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c8, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %37 [8] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %37 : vector<32x1xi32> | |
} | |
%40 = arith.cmpi sgt, %21, %c9 : index | |
%41 = scf.if %40 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c9, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %39 [9] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %39 : vector<32x1xi32> | |
} | |
%42 = arith.cmpi sgt, %21, %c10 : index | |
%43 = scf.if %42 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c10, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %41 [10] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %41 : vector<32x1xi32> | |
} | |
%44 = arith.cmpi sgt, %21, %c11 : index | |
%45 = scf.if %44 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c11, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %43 [11] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %43 : vector<32x1xi32> | |
} | |
%46 = arith.cmpi sgt, %21, %c12 : index | |
%47 = scf.if %46 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c12, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %45 [12] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %45 : vector<32x1xi32> | |
} | |
%48 = arith.cmpi sgt, %21, %c13 : index | |
%49 = scf.if %48 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c13, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %47 [13] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %47 : vector<32x1xi32> | |
} | |
%50 = arith.cmpi sgt, %21, %c14 : index | |
%51 = scf.if %50 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c14, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %49 [14] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %49 : vector<32x1xi32> | |
} | |
%52 = arith.cmpi sgt, %21, %c15 : index | |
%53 = scf.if %52 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c15, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %51 [15] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %51 : vector<32x1xi32> | |
} | |
%54 = arith.cmpi sgt, %21, %c16 : index | |
%55 = scf.if %54 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c16, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %53 [16] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %53 : vector<32x1xi32> | |
} | |
%56 = arith.cmpi sgt, %21, %c17 : index | |
%57 = scf.if %56 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c17, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %55 [17] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %55 : vector<32x1xi32> | |
} | |
%58 = arith.cmpi sgt, %21, %c18 : index | |
%59 = scf.if %58 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c18, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %57 [18] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %57 : vector<32x1xi32> | |
} | |
%60 = arith.cmpi sgt, %21, %c19 : index | |
%61 = scf.if %60 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c19, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %59 [19] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %59 : vector<32x1xi32> | |
} | |
%62 = arith.cmpi sgt, %21, %c20 : index | |
%63 = scf.if %62 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c20, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %61 [20] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %61 : vector<32x1xi32> | |
} | |
%64 = arith.cmpi sgt, %21, %c21 : index | |
%65 = scf.if %64 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c21, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %63 [21] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %63 : vector<32x1xi32> | |
} | |
%66 = arith.cmpi sgt, %21, %c22 : index | |
%67 = scf.if %66 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c22, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %65 [22] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %65 : vector<32x1xi32> | |
} | |
%68 = arith.cmpi sgt, %21, %c23 : index | |
%69 = scf.if %68 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c23, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %67 [23] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %67 : vector<32x1xi32> | |
} | |
%70 = arith.cmpi sgt, %21, %c24 : index | |
%71 = scf.if %70 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c24, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %69 [24] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %69 : vector<32x1xi32> | |
} | |
%72 = arith.cmpi sgt, %21, %c25 : index | |
%73 = scf.if %72 -> (vector<32x1xi32>) { | |
%155 = memref.load %subview_7[%c0, %c0, %c25, %c0] : memref<1x1x?x1xi32, strided<[2073600, 1920, 1, 1], offset: ?>> | |
%156 = vector.broadcast %155 : i32 to vector<1xi32> | |
%157 = vector.insert %156, %71 [25] : vector<1xi32> into vector<32x1xi32> | |
scf.yield %157 : vector<32x1xi32> | |
} else { | |
scf.yield %71 : vector<32x1xi32> | |
} | |
%74 = arith.cmpi sgt, %21, %c26 : index | |
%75 = scf.if %74 -> (vector<32x1xi32>) { | |
%155 = |