pashu123 · August 19, 2024 16:46
diff --git a/2ndbug.mlir b/2ndbug.mlir
 // Dispatch body: reads the ?x?x3200 f32 tensor behind binding 0, applies
 // iree_encoding.set_encoding to it, and writes the result back through the
 // same binding. The two dynamic extents arrive as four i32 push constants.
 func.func @matmul_broad_dispatch_1_set_encoding_LHS_DxDx3200() {
   %zero = arith.constant 0 : index
   %word_bits = arith.constant 32 : i64

   // Dynamic extent 0: push constant 0 supplies the low 32 bits, push constant 1 the high 32 bits.
   %ext0_lo_i32 = hal.interface.constant.load layout(<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
   %ext0_hi_i32 = hal.interface.constant.load layout(<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
   %ext0_lo = arith.extui %ext0_lo_i32 : i32 to i64
   %ext0_hi = arith.extui %ext0_hi_i32 : i32 to i64
   %ext0_hi_shl = arith.shli %ext0_hi, %word_bits : i64
   %ext0_i64 = arith.ori %ext0_lo, %ext0_hi_shl : i64
   %ext0_idx = arith.index_castui %ext0_i64 : i64 to index

   // Dynamic extent 1: same packing, using push constants 2 (low) and 3 (high).
   %ext1_lo_i32 = hal.interface.constant.load layout(<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
   %ext1_hi_i32 = hal.interface.constant.load layout(<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
   %ext1_lo = arith.extui %ext1_lo_i32 : i32 to i64
   %ext1_hi = arith.extui %ext1_hi_i32 : i32 to i64
   %ext1_hi_shl = arith.shli %ext1_hi, %word_bits : i64
   %ext1_i64 = arith.ori %ext1_lo, %ext1_hi_shl : i64
   %ext1_idx = arith.index_castui %ext1_i64 : i64 to index

   // Associate the reassembled extents with workload ordinals 0 and 1.
   %dim0 = flow.dispatch.workload.ordinal %ext0_idx, 0 : index
   %dim1 = flow.dispatch.workload.ordinal %ext1_idx, 1 : index

   // One readwrite binding (set 0, binding 0) is both the source and the destination.
   %binding = hal.interface.binding.subspan layout(<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%zero) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x3200xf32>>{%dim0, %dim1}

   // Load the whole tensor, then attach the matmul operand-0 encoding.
   %plain = flow.dispatch.tensor.load %binding, offsets = [0, 0, 0], sizes = [%dim0, %dim1, 3200], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x3200xf32>>{%dim0, %dim1} -> tensor<?x?x3200xf32>
   %encoded = iree_encoding.set_encoding %plain : tensor<?x?x3200xf32> -> tensor<?x?x3200xf32, #iree_encoding.encoding<operand_index = 0 : index, op_type =  matmul, element_types = [f32, f16, f32], original_type = tensor<?x?x3200xf32>, user_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], round_dims_to = array<i64: 32, 32, 32>>>

   // NOTE(review): the stored value carries the encoding attribute while the
   // binding's tensor type does not -- possibly the mismatch this reproducer is
   // about; confirm against the accompanying report.
   flow.dispatch.tensor.store %encoded, %binding, offsets = [0, 0, 0], sizes = [%dim0, %dim1, 3200], strides = [1, 1, 1] : tensor<?x?x3200xf32, #iree_encoding.encoding<operand_index = 0 : index, op_type =  matmul, element_types = [f32, f16, f32], original_type = tensor<?x?x3200xf32>, user_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], round_dims_to = array<i64: 32, 32, 32>>> -> !flow.dispatch.tensor<readwrite:tensor<?x?x3200xf32>>{%dim0, %dim1}
   return
 }
	// Dispatch body: reads the ?x?x3200 f32 tensor behind binding 0, applies
	// iree_encoding.set_encoding to it, and writes the result back through the
	// same binding. The two dynamic extents arrive as four i32 push constants.
	func.func @matmul_broad_dispatch_1_set_encoding_LHS_DxDx3200() {
		%zero = arith.constant 0 : index
		%word_bits = arith.constant 32 : i64

		// Dynamic extent 0: push constant 0 supplies the low 32 bits, push constant 1 the high 32 bits.
		%ext0_lo_i32 = hal.interface.constant.load layout(<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32
		%ext0_hi_i32 = hal.interface.constant.load layout(<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32
		%ext0_lo = arith.extui %ext0_lo_i32 : i32 to i64
		%ext0_hi = arith.extui %ext0_hi_i32 : i32 to i64
		%ext0_hi_shl = arith.shli %ext0_hi, %word_bits : i64
		%ext0_i64 = arith.ori %ext0_lo, %ext0_hi_shl : i64
		%ext0_idx = arith.index_castui %ext0_i64 : i64 to index

		// Dynamic extent 1: same packing, using push constants 2 (low) and 3 (high).
		%ext1_lo_i32 = hal.interface.constant.load layout(<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(2) : i32
		%ext1_hi_i32 = hal.interface.constant.load layout(<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(3) : i32
		%ext1_lo = arith.extui %ext1_lo_i32 : i32 to i64
		%ext1_hi = arith.extui %ext1_hi_i32 : i32 to i64
		%ext1_hi_shl = arith.shli %ext1_hi, %word_bits : i64
		%ext1_i64 = arith.ori %ext1_lo, %ext1_hi_shl : i64
		%ext1_idx = arith.index_castui %ext1_i64 : i64 to index

		// Associate the reassembled extents with workload ordinals 0 and 1.
		%dim0 = flow.dispatch.workload.ordinal %ext0_idx, 0 : index
		%dim1 = flow.dispatch.workload.ordinal %ext1_idx, 1 : index

		// One readwrite binding (set 0, binding 0) is both the source and the destination.
		%binding = hal.interface.binding.subspan layout(<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%zero) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x3200xf32>>{%dim0, %dim1}

		// Load the whole tensor, then attach the matmul operand-0 encoding.
		%plain = flow.dispatch.tensor.load %binding, offsets = [0, 0, 0], sizes = [%dim0, %dim1, 3200], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x3200xf32>>{%dim0, %dim1} -> tensor<?x?x3200xf32>
		%encoded = iree_encoding.set_encoding %plain : tensor<?x?x3200xf32> -> tensor<?x?x3200xf32, #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f16, f32], original_type = tensor<?x?x3200xf32>, user_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], round_dims_to = array<i64: 32, 32, 32>>>

		// NOTE(review): the stored value carries the encoding attribute while the
		// binding's tensor type does not -- possibly the mismatch this reproducer is
		// about; confirm against the accompanying report.
		flow.dispatch.tensor.store %encoded, %binding, offsets = [0, 0, 0], sizes = [%dim0, %dim1, 3200], strides = [1, 1, 1] : tensor<?x?x3200xf32, #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f16, f32], original_type = tensor<?x?x3200xf32>, user_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], round_dims_to = array<i64: 32, 32, 32>>> -> !flow.dispatch.tensor<readwrite:tensor<?x?x3200xf32>>{%dim0, %dim1}
		return
	}