pashu123 · September 18, 2024 10:00
diff --git a/test.mlir b/test.mlir
 // -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
 // Dispatch function that packs a dynamically shaped 2-D i32 tensor (binding 0)
 // into a dynamically shaped 4-D i32 tensor (binding 1) with tensor.pack.
 // Takes no arguments and returns nothing; every shape is rebuilt from the ten
 // i32 interface constants loaded below.
 func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32() attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
  // Index constants: loop bounds/steps (%c0, %c1), dimension numbers for
  // tensor.dim (%c2, %c3), the offset operand of both binding subspans (%c64),
  // and the shift amount used to join two i32 halves (%c32_i64).
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %c3 = arith.constant 3 : index
  %c2 = arith.constant 2 : index
  %c64 = arith.constant 64 : index
  %c32_i64 = arith.constant 32 : i64
  // Load the ten i32 interface constants (ordinals 0 through 9).
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Join each consecutive pair of i32 constants into one index value:
  // zero-extend both to i64, shift the odd-ordinal one left by 32, OR it with
  // the even-ordinal one, then cast the i64 to index (unsigned).
  // Constants 0 and 1 -> %14.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  // Constants 2 and 3 -> %19.
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  // Constants 4 and 5 -> %24.
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  // Constants 6 and 7 -> %29.
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  // Constants 8 and 9 -> %34.
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Tag the five index values as workload ordinals 0 through 4.
  // Below, %35/%36 are the two dims of the 2-D input, %37/%38 are the first two
  // dims of the 4-D output, and %39 is used for both trailing output dims and
  // as the inner tile size of tensor.pack.
  %35 = flow.dispatch.workload.ordinal %14, 0 : index
  %36 = flow.dispatch.workload.ordinal %19, 1 : index
  %37 = flow.dispatch.workload.ordinal %24, 2 : index
  %38 = flow.dispatch.workload.ordinal %29, 3 : index
  %39 = flow.dispatch.workload.ordinal %34, 4 : index
  // Binding 0: read-only 2-D input of shape %35 x %36.
  %40 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c64) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%35, %36}
  // Binding 1: write-only 4-D output of shape %37 x %38 x %39 x %39.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c64) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%37, %38, %39, %39}
  // Load the whole input and the whole output binding as tensors; %43 becomes
  // the initial shared output of the scf.forall below.
  %42 = flow.dispatch.tensor.load %40, offsets = [0, 0], sizes = [%35, %36], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%35, %36} -> tensor<?x?xi32>
  %43 = flow.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%37, %38, %39, %39], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%37, %38, %39, %39} -> tensor<?x?x?x?xi32>
  // Iterate over the first two output dims (%37, %38) in steps of 64; the
  // mapping attribute on the closing brace assigns the two induction variables
  // to workgroup y and workgroup x.
  %44 = scf.forall (%arg0, %arg1) = (0, 0) to (%37, %38) step (64, 64) shared_outs(%arg2 = %43) -> (tensor<?x?x?x?xi32>) {
    // Extent of this tile along each outer dim: min(bound - iv, 64).
    %45 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%37]
    %46 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg1)[%38]
    // Offsets (%47, %49) and sizes (%48, %50) of the matching input region:
    // the output-tile offset and extent, each multiplied by %39.
    %47 = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%arg0)[%39]
    %48 = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%45)[%39]
    %49 = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%arg1)[%39]
    %50 = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%46)[%39]
    %extracted_slice = tensor.extract_slice %42[%47, %49] [%48, %50] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32>
    // Sizes of dims 2 and 3 of the loaded output tensor %43 (which was loaded
    // with size %39 for both of those dims).
    %dim = tensor.dim %43, %c2 : tensor<?x?x?x?xi32>
    %dim_0 = tensor.dim %43, %c3 : tensor<?x?x?x?xi32>
    // Output tile for this forall iteration, taken from the shared output.
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [%45, %46, %dim, %dim_0] [1, 1, 1, 1] : tensor<?x?x?x?xi32> to tensor<?x?x?x?xi32>
    // Two nested step-1 loops over the tile's outer extents (%45, %46); the
    // output tile is threaded through both loops as an iter_arg.
    %51 = scf.for %arg3 = %c0 to %45 step %c1 iter_args(%arg4 = %extracted_slice_1) -> (tensor<?x?x?x?xi32>) {
      // Row offset into the input slice: %arg3 * %39.
      %52 = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%arg3)[%39]
      %53 = scf.for %arg5 = %c0 to %46 step %c1 iter_args(%arg6 = %arg4) -> (tensor<?x?x?x?xi32>) {
        // Column offset into the input slice: %arg5 * %39.
        %54 = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%arg5)[%39]
        // One %39 x %39 block of input, and the 1x1x?x? destination slice at
        // position (%arg3, %arg5) of the running output tile.
        %extracted_slice_2 = tensor.extract_slice %extracted_slice[%52, %54] [%39, %39] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32>
        %extracted_slice_3 = tensor.extract_slice %arg6[%arg3, %arg5, 0, 0] [1, 1, %dim, %dim_0] [1, 1, 1, 1] : tensor<?x?x?x?xi32> to tensor<1x1x?x?xi32>
        // Pack the input block with inner tiles %39 x %39 on dims 0 and 1, then
        // insert the packed result back at the same position of the output tile.
        %pack = tensor.pack %extracted_slice_2 inner_dims_pos = [0, 1] inner_tiles = [%39, %39] into %extracted_slice_3 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [1, 1]]>} : tensor<?x?xi32> -> tensor<1x1x?x?xi32>
        %inserted_slice = tensor.insert_slice %pack into %arg6[%arg3, %arg5, 0, 0] [1, 1, %dim, %dim_0] [1, 1, 1, 1] : tensor<1x1x?x?xi32> into tensor<?x?x?x?xi32>
        scf.yield %inserted_slice : tensor<?x?x?x?xi32>
      }
      scf.yield %53 : tensor<?x?x?x?xi32>
    }
    // Write the finished tile into the shared output at (%arg0, %arg1, 0, 0).
    // NOTE(review): the inner sizes here are %39, %39 while the extract above
    // used %dim, %dim_0; these are expected to be equal - confirm.
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %51 into %arg2[%arg0, %arg1, 0, 0] [%45, %46, %39, %39] [1, 1, 1, 1] : tensor<?x?x?x?xi32> into tensor<?x?x?x?xi32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // Store the complete packed result to binding 1.
  flow.dispatch.tensor.store %44, %41, offsets = [0, 0, 0, 0], sizes = [%37, %38, %39, %39], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%37, %38, %39, %39}
  return
 }
	// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
	// Dispatch function that packs a dynamically shaped 2-D i32 tensor (binding 0)
	// into a dynamically shaped 4-D i32 tensor (binding 1) with tensor.pack.
	// Takes no arguments and returns nothing; every shape is rebuilt from the ten
	// i32 interface constants loaded below.
	func.func @_fully_dynamic_pack_simple_dispatch_0_pack_i32() attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
	// Index constants: loop bounds/steps (%c0, %c1), dimension numbers for
	// tensor.dim (%c2, %c3), the offset operand of both binding subspans (%c64),
	// and the shift amount used to join two i32 halves (%c32_i64).
	%c1 = arith.constant 1 : index
	%c0 = arith.constant 0 : index
	%c3 = arith.constant 3 : index
	%c2 = arith.constant 2 : index
	%c64 = arith.constant 64 : index
	%c32_i64 = arith.constant 32 : i64
	// Load the ten i32 interface constants (ordinals 0 through 9).
	%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
	%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
	%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
	%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
	%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
	%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
	%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
	%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
	%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
	%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
	// Join each consecutive pair of i32 constants into one index value:
	// zero-extend both to i64, shift the odd-ordinal one left by 32, OR it with
	// the even-ordinal one, then cast the i64 to index (unsigned).
	// Constants 0 and 1 -> %14.
	%10 = arith.extui %0 : i32 to i64
	%11 = arith.extui %1 : i32 to i64
	%12 = arith.shli %11, %c32_i64 : i64
	%13 = arith.ori %10, %12 : i64
	%14 = arith.index_castui %13 : i64 to index
	// Constants 2 and 3 -> %19.
	%15 = arith.extui %2 : i32 to i64
	%16 = arith.extui %3 : i32 to i64
	%17 = arith.shli %16, %c32_i64 : i64
	%18 = arith.ori %15, %17 : i64
	%19 = arith.index_castui %18 : i64 to index
	// Constants 4 and 5 -> %24.
	%20 = arith.extui %4 : i32 to i64
	%21 = arith.extui %5 : i32 to i64
	%22 = arith.shli %21, %c32_i64 : i64
	%23 = arith.ori %20, %22 : i64
	%24 = arith.index_castui %23 : i64 to index
	// Constants 6 and 7 -> %29.
	%25 = arith.extui %6 : i32 to i64
	%26 = arith.extui %7 : i32 to i64
	%27 = arith.shli %26, %c32_i64 : i64
	%28 = arith.ori %25, %27 : i64
	%29 = arith.index_castui %28 : i64 to index
	// Constants 8 and 9 -> %34.
	%30 = arith.extui %8 : i32 to i64
	%31 = arith.extui %9 : i32 to i64
	%32 = arith.shli %31, %c32_i64 : i64
	%33 = arith.ori %30, %32 : i64
	%34 = arith.index_castui %33 : i64 to index
	// Tag the five index values as workload ordinals 0 through 4.
	// Below, %35/%36 are the two dims of the 2-D input, %37/%38 are the first two
	// dims of the 4-D output, and %39 is used for both trailing output dims and
	// as the inner tile size of tensor.pack.
	%35 = flow.dispatch.workload.ordinal %14, 0 : index
	%36 = flow.dispatch.workload.ordinal %19, 1 : index
	%37 = flow.dispatch.workload.ordinal %24, 2 : index
	%38 = flow.dispatch.workload.ordinal %29, 3 : index
	%39 = flow.dispatch.workload.ordinal %34, 4 : index
	// Binding 0: read-only 2-D input of shape %35 x %36.
	%40 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c64) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%35, %36}
	// Binding 1: write-only 4-D output of shape %37 x %38 x %39 x %39.
	%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c64) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%37, %38, %39, %39}
	// Load the whole input and the whole output binding as tensors; %43 becomes
	// the initial shared output of the scf.forall below.
	%42 = flow.dispatch.tensor.load %40, offsets = [0, 0], sizes = [%35, %36], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%35, %36} -> tensor<?x?xi32>
	%43 = flow.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%37, %38, %39, %39], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%37, %38, %39, %39} -> tensor<?x?x?x?xi32>
	// Iterate over the first two output dims (%37, %38) in steps of 64; the
	// mapping attribute on the closing brace assigns the two induction variables
	// to workgroup y and workgroup x.
	%44 = scf.forall (%arg0, %arg1) = (0, 0) to (%37, %38) step (64, 64) shared_outs(%arg2 = %43) -> (tensor<?x?x?x?xi32>) {
	// Extent of this tile along each outer dim: min(bound - iv, 64).
	%45 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%37]
	%46 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg1)[%38]
	// Offsets (%47, %49) and sizes (%48, %50) of the matching input region:
	// the output-tile offset and extent, each multiplied by %39.
	%47 = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%arg0)[%39]
	%48 = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%45)[%39]
	%49 = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%arg1)[%39]
	%50 = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%46)[%39]
	%extracted_slice = tensor.extract_slice %42[%47, %49] [%48, %50] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32>
	// Sizes of dims 2 and 3 of the loaded output tensor %43 (which was loaded
	// with size %39 for both of those dims).
	%dim = tensor.dim %43, %c2 : tensor<?x?x?x?xi32>
	%dim_0 = tensor.dim %43, %c3 : tensor<?x?x?x?xi32>
	// Output tile for this forall iteration, taken from the shared output.
	%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [%45, %46, %dim, %dim_0] [1, 1, 1, 1] : tensor<?x?x?x?xi32> to tensor<?x?x?x?xi32>
	// Two nested step-1 loops over the tile's outer extents (%45, %46); the
	// output tile is threaded through both loops as an iter_arg.
	%51 = scf.for %arg3 = %c0 to %45 step %c1 iter_args(%arg4 = %extracted_slice_1) -> (tensor<?x?x?x?xi32>) {
	// Row offset into the input slice: %arg3 * %39.
	%52 = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%arg3)[%39]
	%53 = scf.for %arg5 = %c0 to %46 step %c1 iter_args(%arg6 = %arg4) -> (tensor<?x?x?x?xi32>) {
	// Column offset into the input slice: %arg5 * %39.
	%54 = affine.apply affine_map<(d0)[s0] -> (d0 * s0)>(%arg5)[%39]
	// One %39 x %39 block of input, and the 1x1x?x? destination slice at
	// position (%arg3, %arg5) of the running output tile.
	%extracted_slice_2 = tensor.extract_slice %extracted_slice[%52, %54] [%39, %39] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32>
	%extracted_slice_3 = tensor.extract_slice %arg6[%arg3, %arg5, 0, 0] [1, 1, %dim, %dim_0] [1, 1, 1, 1] : tensor<?x?x?x?xi32> to tensor<1x1x?x?xi32>
	// Pack the input block with inner tiles %39 x %39 on dims 0 and 1, then
	// insert the packed result back at the same position of the output tile.
	%pack = tensor.pack %extracted_slice_2 inner_dims_pos = [0, 1] inner_tiles = [%39, %39] into %extracted_slice_3 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [1, 1]]>} : tensor<?x?xi32> -> tensor<1x1x?x?xi32>
	%inserted_slice = tensor.insert_slice %pack into %arg6[%arg3, %arg5, 0, 0] [1, 1, %dim, %dim_0] [1, 1, 1, 1] : tensor<1x1x?x?xi32> into tensor<?x?x?x?xi32>
	scf.yield %inserted_slice : tensor<?x?x?x?xi32>
	}
	scf.yield %53 : tensor<?x?x?x?xi32>
	}
	// Write the finished tile into the shared output at (%arg0, %arg1, 0, 0).
	// NOTE(review): the inner sizes here are %39, %39 while the extract above
	// used %dim, %dim_0; these are expected to be equal - confirm.
	scf.forall.in_parallel {
	tensor.parallel_insert_slice %51 into %arg2[%arg0, %arg1, 0, 0] [%45, %46, %39, %39] [1, 1, 1, 1] : tensor<?x?x?x?xi32> into tensor<?x?x?x?xi32>
	}
	} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
	// Store the complete packed result to binding 1.
	flow.dispatch.tensor.store %44, %41, offsets = [0, 0, 0, 0], sizes = [%37, %38, %39, %39], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xi32>>{%37, %38, %39, %39}
	return
	}