@pashu123 · Created October 4, 2024 16:10
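IREE MLIR dump: the prefill_bs4$async_dispatch_20_pack_f16 dispatch after lowering with the CPUDataTiling pipeline. It packs a dynamically sized 128x?x100xf16 tensor into 128x?x100x16x1xf16 (inner tiles [16, 1] on dims [1, 2]), tiling the pack across workgroups with scf.forall.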
func.func @prefill_bs4$async_dispatch_20_pack_f16() attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f16
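  // Load the eight i32 push constants declared by the pipeline layout (one read-only and one writable storage-buffer binding).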
  %0 = hal.interface.constant.load layout(<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
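  // Recombine each (lo, hi) i32 pair into an i64 (lo | hi << 32) and cast to index: %12/%17 feed the binding offsets below, %22/%27 the dynamic sizes.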
  %8 = arith.extui %0 : i32 to i64
  %9 = arith.extui %1 : i32 to i64
  %10 = arith.shli %9, %c32_i64 : i64
  %11 = arith.ori %8, %10 : i64
  %12 = arith.index_castui %11 : i64 to index
  %13 = arith.extui %2 : i32 to i64
  %14 = arith.extui %3 : i32 to i64
  %15 = arith.shli %14, %c32_i64 : i64
  %16 = arith.ori %13, %15 : i64
  %17 = arith.index_castui %16 : i64 to index
  %18 = arith.extui %4 : i32 to i64
  %19 = arith.extui %5 : i32 to i64
  %20 = arith.shli %19, %c32_i64 : i64
  %21 = arith.ori %18, %20 : i64
  %22 = arith.index_castui %21 : i64 to index
  %23 = arith.extui %6 : i32 to i64
  %24 = arith.extui %7 : i32 to i64
  %25 = arith.shli %24, %c32_i64 : i64
  %26 = arith.ori %23, %25 : i64
  %27 = arith.index_castui %26 : i64 to index
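  // Tie the dynamic sizes to workload ordinals: %28 is the dynamic dim of the 128x?x100 source, %29 the corresponding outer dim of the packed destination.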
  %28 = flow.dispatch.workload.ordinal %22, 0 : index
  %29 = flow.dispatch.workload.ordinal %27, 1 : index
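  // Bind the read-only source and writable destination buffers at the offsets computed above, then load the full source tensor.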
  %30 = hal.interface.binding.subspan layout(<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%12) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x?x100xf16>>{%28}
  %31 = hal.interface.binding.subspan layout(<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%17) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x?x100x16x1xf16>>{%29}
  %32 = flow.dispatch.tensor.load %30, offsets = [0, 0, 0], sizes = [128, %28, 100], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x?x100xf16>>{%28} -> tensor<128x?x100xf16>
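  // Create the packed destination and tile the pack over workgroups with tile sizes [16, 4, 50] on the packed outer dims (128 x %29 x 100).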
  %33 = tensor.empty(%29) : tensor<128x?x100x16x1xf16>
  %34 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (128, %29, 100) step (16, 4, 50) shared_outs(%arg3 = %33) -> (tensor<128x?x100x16x1xf16>) {
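    // Clamp each tile to the tensor bounds; %37 and %38 map the packed tile position and size back into coordinates of the unpacked source.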
    %35 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 4)>(%arg1)[%29]
    %36 = affine.min affine_map<(d0) -> (-d0 + 128, 16)>(%arg0)
    %37 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
    %38 = affine.min affine_map<(d0, d1)[s0] -> (d1 * -16 + s0, d0 * 16)>(%35, %arg1)[%28]
    %39 = affine.min affine_map<(d0) -> (-d0 + 100, 50)>(%arg2)
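    // Slice the unpacked source tile and the matching packed destination tile.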
    %extracted_slice = tensor.extract_slice %32[%arg0, %37, %arg2] [%36, %38, %39] [1, 1, 1] : tensor<128x?x100xf16> to tensor<?x?x?xf16>
    %extracted_slice_0 = tensor.extract_slice %arg3[%arg0, %arg1, %arg2, 0, 0] [16, %35, 50, 16, 1] [1, 1, 1, 1, 1] : tensor<128x?x100x16x1xf16> to tensor<16x?x50x16x1xf16>
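    // Pack the slice into [16, 1] inner tiles on dims [1, 2], padding the ragged boundary with 0.0.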
    %pack = tensor.pack %extracted_slice padding_value(%cst : f16) outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %extracted_slice_0 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[16, 4, 50], [1, 1, 16]]>} : tensor<?x?x?xf16> -> tensor<16x?x50x16x1xf16>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %pack into %arg3[%arg0, %arg1, %arg2, 0, 0] [16, %35, 50, 16, 1] [1, 1, 1, 1, 1] : tensor<16x?x50x16x1xf16> into tensor<128x?x100x16x1xf16>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
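  // Store the packed result to the writable output binding.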
  flow.dispatch.tensor.store %34, %31, offsets = [0, 0, 0, 0, 0], sizes = [128, %29, 100, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<128x?x100x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<128x?x100x16x1xf16>>{%29}
  return
}