Skip to content

Instantly share code, notes, and snippets.

@pashu123
Created June 28, 2024 22:11
Show Gist options
  • Save pashu123/4607c60303a472ab0fbb0bd7829338ff to your computer and use it in GitHub Desktop.
// Tiling configuration attached to the linalg.mmt4d op below. Six tile-size
// lists over six loop dimensions; non-zero entries appear at [1,1,0,0,0,0]
// (twice), [1,1,0,16,16,0], and [0,0,1,0,0,1]. NOTE(review): these presumably
// correspond to IREE's CPU tiling levels (distribution / cache / vector,
// parallel vs. reduction) — confirm against the iree_codegen docs.
#matmul_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [1, 1, 0, 16, 16, 0], [0, 0, 1, 0, 0, 1], [0, 0, 0, 0, 0, 0]]>
// Compilation target: llvm-cpu backend, embedded ELF for x86_64 (znver4,
// AVX-512, 64-byte native vectors).
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+avx512f", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// Dispatch entry point: f32 mmt4d (4-D packed matmul with 16x1 / 16x1 inner
// tiles) whose result is fused with a broadcast bias-add followed by a ReLU
// (elementwise max with 0), then stored to the output binding.
func.func @mmt4d_bias_relu_fusion_dispatch_0_generic_DxDx16x16_f32() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
// Ten 32-bit push constants; consecutive pairs are recombined below into
// five 64-bit values (lo | hi << 32) and cast to index — the dynamic dims.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
%4 = hal.interface.constant.load[4] : i32
%5 = hal.interface.constant.load[5] : i32
%6 = hal.interface.constant.load[6] : i32
%7 = hal.interface.constant.load[7] : i32
%8 = hal.interface.constant.load[8] : i32
%9 = hal.interface.constant.load[9] : i32
// %14 = (constants 0,1) as index.
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
// %19 = (constants 2,3) as index.
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
// %24 = (constants 4,5) as index.
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
// %29 = (constants 6,7) as index.
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
// %34 = (constants 8,9) as index.
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
// Register the five dynamic sizes as workload ordinals 0..4:
// %35 = K-outer, %36 = K-outer (RHS side), %37 = bias rows,
// %38/%39 = the two outer dims of the ?x?x16x16 result.
// NOTE(review): exact M/N/K roles inferred from their uses below — confirm.
%35 = flow.dispatch.workload.ordinal %14, 0 : index
%36 = flow.dispatch.workload.ordinal %19, 1 : index
%37 = flow.dispatch.workload.ordinal %24, 2 : index
%38 = flow.dispatch.workload.ordinal %29, 3 : index
%39 = flow.dispatch.workload.ordinal %34, 4 : index
// Bindings: 0 = LHS %38x%35x16x1, 1 = RHS %39x%36x16x1, 2 = bias %37x16
// (read-only); 3 = output %38x%39x16x16 (write-only).
%40 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%38, %35}
%41 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
%42 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x16xf32>>{%37}
%43 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%38, %39}
// Load the full extent of each input tensor.
%44 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0], sizes = [%38, %35, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%38, %35} -> tensor<?x?x16x1xf32>
%45 = flow.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
%46 = flow.dispatch.tensor.load %42, offsets = [0, 0], sizes = [%37, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x16xf32>>{%37} -> tensor<?x16xf32>
// Zero-initialized accumulator for the matmul.
%47 = tensor.empty(%38, %39) : tensor<?x?x16x16xf32>
%48 = linalg.fill ins(%cst : f32) outs(%47 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
// Packed matmul: (?x?x16x1) x (?x?x16x1) -> ?x?x16x16, tiled per
// #matmul_config.
%49 = linalg.mmt4d {lowering_config = #matmul_config} ins(%44, %45 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%48 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
// Fused epilogue: bias %46 is indexed by (d0, d2), i.e. broadcast along
// d1 and d3 of the result; each element gets bias added then max(x, 0).
// All iterators are parallel, so outs(%47) only supplies the output shape.
%50 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
affine_map<(d0, d1, d2, d3) -> (d0, d2)>,
affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
ins(%49, %46 : tensor<?x?x16x16xf32>, tensor<?x16xf32>)
outs(%47 : tensor<?x?x16x16xf32>)
{
^bb0(%in: f32, %in_0: f32, %out: f32):
%51 = arith.addf %in, %in_0 : f32
%52 = arith.maximumf %51, %cst : f32
linalg.yield %52 : f32
} -> tensor<?x?x16x16xf32>
// Write the activated result to the output binding.
flow.dispatch.tensor.store %50, %43, offsets = [0, 0, 0, 0], sizes = [%38, %39, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%38, %39}
return
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment