pashu123 · August 19, 2024 10:56
diff --git a/bug.mlir b/bug.mlir
 // Indexing maps for the linalg.generic in the first dispatch region:
 // #map reads the 2-D input at (d1, d2) and ignores d0, while #map1 is the
 // identity map used for the 3-D output.
 #map = affine_map<(d0, d1, d2) -> (d1, d2)>
 #map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 // Maps over a 4-D iteration space. They are listed, in this order, as
 // user_indexing_maps in every #iree_encoding.encoding attribute below.
 // NOTE(review): they look like (batch, m, k), (batch, n, k), (batch, m, n)
 // for a batch matmul with a transposed RHS - confirm against the op definition.
 #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
 #map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
 module {
  // Public entry point taking two HAL buffer views and returning one. The
  // reflection string declares the tensor-level signature:
  // (tensor<?x?x3200xf32>, tensor<8640x3200xf16>) -> tensor<?x?x8640xf32>.
  util.func public @matmul_broad(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_broad(%input0: tensor<?x?x3200xf32>, %input1: tensor<8640x3200xf16>) -> (%output0: tensor<?x?x8640xf32>)"}} {
    // f32 zero, used below as the fill value for the matmul accumulator.
    %cst = arith.constant 0.000000e+00 : f32
    // Query the two dynamic dimensions (dim 0 and dim 1) of the first argument.
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    // Import both buffer views as tensors; only input0 has dynamic dims.
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x3200xf32>{%0, %1}
    %3 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<8640x3200xf16>
    // First dispatch region: broadcast the 2-D f16 input %3 along a new leading
    // dimension of size %0, producing tensor<?x8640x3200xf16>.
    %4 = flow.dispatch.region -> (tensor<?x8640x3200xf16>{%0}) {
      %9 = tensor.empty(%0) : tensor<?x8640x3200xf16>
      // Every output element (d0, d1, d2) is a copy of the input element (d1, d2).
      %10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<8640x3200xf16>) outs(%9 : tensor<?x8640x3200xf16>) {
      ^bb0(%in: f16, %out: f16):
        linalg.yield %in : f16
      } -> tensor<?x8640x3200xf16>
      flow.return %10 : tensor<?x8640x3200xf16>
    }
    // Attach encodings outside any dispatch region: the broadcast result gets
    // operand_index = 1 and the f32 input gets operand_index = 0.
    %5 = iree_encoding.set_encoding %4 : tensor<?x8640x3200xf16> -> tensor<?x8640x3200xf16, #iree_encoding.encoding<operand_index = 1 : index, op_type =  matmul, element_types = [f32, f16, f32], original_type = tensor<?x8640x3200xf16>, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array<i64: 32, 32, 32>>>
    %6 = iree_encoding.set_encoding %2 : tensor<?x?x3200xf32> -> tensor<?x?x3200xf32, #iree_encoding.encoding<operand_index = 0 : index, op_type =  matmul, element_types = [f32, f16, f32], original_type = tensor<?x?x3200xf32>, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array<i64: 32, 32, 32>>>
    // Second dispatch region: zero-filled encoded accumulator, mixed-precision
    // batch matmul on the encoded operands, then unset_encoding and a slice.
    %7 = flow.dispatch.region -> (tensor<?x?x8640xf32>{%0, %1}) {
      // Accumulator with the operand_index = 2 encoding, sized by %0 and %1.
      %9 = tensor.empty(%0, %1) : tensor<?x?x8640xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type =  matmul, element_types = [f32, f16, f32], original_type = tensor<?x?x8640xf32>, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array<i64: 32, 32, 32>>>
      %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<?x?x8640xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type =  matmul, element_types = [f32, f16, f32], original_type = tensor<?x?x8640xf32>, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array<i64: 32, 32, 32>>>) -> tensor<?x?x8640xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type =  matmul, element_types = [f32, f16, f32], original_type = tensor<?x?x8640xf32>, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array<i64: 32, 32, 32>>>
      // f32 LHS (%6) times f16 RHS (%5) accumulated into f32, which matches
      // element_types = [f32, f16, f32] in the encodings.
      %11 = linalg.batch_matmul_transpose_b ins(%6, %5 : tensor<?x?x3200xf32, #iree_encoding.encoding<operand_index = 0 : index, op_type =  matmul, element_types = [f32, f16, f32], original_type = tensor<?x?x3200xf32>, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array<i64: 32, 32, 32>>>, tensor<?x8640x3200xf16, #iree_encoding.encoding<operand_index = 1 : index, op_type =  matmul, element_types = [f32, f16, f32], original_type = tensor<?x8640x3200xf16>, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array<i64: 32, 32, 32>>>) outs(%10 : tensor<?x?x8640xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type =  matmul, element_types = [f32, f16, f32], original_type = tensor<?x?x8640xf32>, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array<i64: 32, 32, 32>>>) -> tensor<?x?x8640xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type =  matmul, element_types = [f32, f16, f32], original_type = tensor<?x?x8640xf32>, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array<i64: 32, 32, 32>>>
      // Drop the encoding to get back a plain tensor<?x?x8640xf32>.
      %12 = iree_encoding.unset_encoding %11 : tensor<?x?x8640xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type =  matmul, element_types = [f32, f16, f32], original_type = tensor<?x?x8640xf32>, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array<i64: 32, 32, 32>>> -> tensor<?x?x8640xf32>
      // Slice at offset 0 with unit strides and sizes [%0, %1, 8640].
      // NOTE(review): presumably trims padding introduced by round_dims_to - confirm.
      %extracted_slice = tensor.extract_slice %12[0, 0, 0] [%0, %1, 8640] [1, 1, 1] : tensor<?x?x8640xf32> to tensor<?x?x8640xf32>
      flow.return %extracted_slice : tensor<?x?x8640xf32>
    }
    // Export the result tensor as a buffer view and return it.
    %8 = hal.tensor.export %7 "output0" : tensor<?x?x8640xf32>{%0, %1} -> !hal.buffer_view
    util.return %8 : !hal.buffer_view
  }
 }
	// NOTE(review): this tab-indented listing repeats the module that appears
	// earlier in this file; the two copies differ only in whitespace. If both are
	// parsed as one MLIR file, the aliases #map..#map4 and @matmul_broad would be
	// defined twice - confirm which copy is the intended reproducer.
	//
	// #map reads a 2-D input at (d1, d2) and ignores d0; #map1 is the 3-D identity
	// map. Both are used by the linalg.generic below.
	#map = affine_map<(d0, d1, d2) -> (d1, d2)>
	#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
	// Maps over a 4-D iteration space, listed in this order as user_indexing_maps
	// in every #iree_encoding.encoding attribute below.
	#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
	#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
	#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
	module {
	// Public entry point taking two HAL buffer views and returning one. The
	// reflection string declares the tensor-level signature:
	// (tensor<?x?x3200xf32>, tensor<8640x3200xf16>) -> tensor<?x?x8640xf32>.
	util.func public @matmul_broad(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_broad(%input0: tensor<?x?x3200xf32>, %input1: tensor<8640x3200xf16>) -> (%output0: tensor<?x?x8640xf32>)"}} {
	// f32 zero, used below as the fill value for the matmul accumulator.
	%cst = arith.constant 0.000000e+00 : f32
	// Query the two dynamic dimensions (dim 0 and dim 1) of the first argument.
	%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
	%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
	// Import both buffer views as tensors; only input0 has dynamic dims.
	%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x3200xf32>{%0, %1}
	%3 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<8640x3200xf16>
	// First dispatch region: broadcast the 2-D f16 input %3 along a new leading
	// dimension of size %0, producing tensor<?x8640x3200xf16>.
	%4 = flow.dispatch.region -> (tensor<?x8640x3200xf16>{%0}) {
	%9 = tensor.empty(%0) : tensor<?x8640x3200xf16>
	// Every output element (d0, d1, d2) is a copy of the input element (d1, d2).
	%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<8640x3200xf16>) outs(%9 : tensor<?x8640x3200xf16>) {
	^bb0(%in: f16, %out: f16):
	linalg.yield %in : f16
	} -> tensor<?x8640x3200xf16>
	flow.return %10 : tensor<?x8640x3200xf16>
	}
	// Attach encodings outside any dispatch region: the broadcast result gets
	// operand_index = 1 and the f32 input gets operand_index = 0.
	%5 = iree_encoding.set_encoding %4 : tensor<?x8640x3200xf16> -> tensor<?x8640x3200xf16, #iree_encoding.encoding<operand_index = 1 : index, op_type = matmul, element_types = [f32, f16, f32], original_type = tensor<?x8640x3200xf16>, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array<i64: 32, 32, 32>>>
	%6 = iree_encoding.set_encoding %2 : tensor<?x?x3200xf32> -> tensor<?x?x3200xf32, #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f16, f32], original_type = tensor<?x?x3200xf32>, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array<i64: 32, 32, 32>>>
	// Second dispatch region: zero-filled encoded accumulator, mixed-precision
	// batch matmul on the encoded operands, then unset_encoding and a slice.
	%7 = flow.dispatch.region -> (tensor<?x?x8640xf32>{%0, %1}) {
	// Accumulator with the operand_index = 2 encoding, sized by %0 and %1.
	%9 = tensor.empty(%0, %1) : tensor<?x?x8640xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [f32, f16, f32], original_type = tensor<?x?x8640xf32>, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array<i64: 32, 32, 32>>>
	%10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<?x?x8640xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [f32, f16, f32], original_type = tensor<?x?x8640xf32>, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array<i64: 32, 32, 32>>>) -> tensor<?x?x8640xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [f32, f16, f32], original_type = tensor<?x?x8640xf32>, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array<i64: 32, 32, 32>>>
	// f32 LHS (%6) times f16 RHS (%5) accumulated into f32, which matches
	// element_types = [f32, f16, f32] in the encodings.
	%11 = linalg.batch_matmul_transpose_b ins(%6, %5 : tensor<?x?x3200xf32, #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f16, f32], original_type = tensor<?x?x3200xf32>, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array<i64: 32, 32, 32>>>, tensor<?x8640x3200xf16, #iree_encoding.encoding<operand_index = 1 : index, op_type = matmul, element_types = [f32, f16, f32], original_type = tensor<?x8640x3200xf16>, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array<i64: 32, 32, 32>>>) outs(%10 : tensor<?x?x8640xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [f32, f16, f32], original_type = tensor<?x?x8640xf32>, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array<i64: 32, 32, 32>>>) -> tensor<?x?x8640xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [f32, f16, f32], original_type = tensor<?x?x8640xf32>, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array<i64: 32, 32, 32>>>
	// Drop the encoding to get back a plain tensor<?x?x8640xf32>.
	%12 = iree_encoding.unset_encoding %11 : tensor<?x?x8640xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [f32, f16, f32], original_type = tensor<?x?x8640xf32>, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array<i64: 32, 32, 32>>> -> tensor<?x?x8640xf32>
	// Slice at offset 0 with unit strides and sizes [%0, %1, 8640].
	// NOTE(review): presumably trims padding introduced by round_dims_to - confirm.
	%extracted_slice = tensor.extract_slice %12[0, 0, 0] [%0, %1, 8640] [1, 1, 1] : tensor<?x?x8640xf32> to tensor<?x?x8640xf32>
	flow.return %extracted_slice : tensor<?x?x8640xf32>
	}
	// Export the result tensor as a buffer view and return it.
	%8 = hal.tensor.export %7 "output0" : tensor<?x?x8640xf32>{%0, %1} -> !hal.buffer_view
	util.return %8 : !hal.buffer_view
	}
	}