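// IREE LLVM-CPU dispatch: a data-tiled matmul (linalg.mmt4d) fused with a
// bias-add + ReLU epilogue, compiled for znver4 with AVX-512. The
// lowering_config below pins the tile sizes used at each tiling level.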
#matmul_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [1, 1, 0, 16, 16, 0], [0, 0, 1, 0, 0, 1], [0, 0, 0, 0, 0, 0]]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+avx512f", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
func.func @mmt4d_bias_relu_fusion_dispatch_0_generic_DxDx16x16_f32() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
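  // Load the ten 32-bit push constants; each consecutive (lo, hi) pair
  // encodes one 64-bit dynamic dimension size.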
  %0 = hal.interface.constant.load[0] : i32
  %1 = hal.interface.constant.load[1] : i32
  %2 = hal.interface.constant.load[2] : i32
  %3 = hal.interface.constant.load[3] : i32
  %4 = hal.interface.constant.load[4] : i32
  %5 = hal.interface.constant.load[5] : i32
  %6 = hal.interface.constant.load[6] : i32
  %7 = hal.interface.constant.load[7] : i32
  %8 = hal.interface.constant.load[8] : i32
  %9 = hal.interface.constant.load[9] : i32
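  // Reassemble each pair: zero-extend both halves to i64, shift the high
  // word left by 32, OR them together, and cast the result to index.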
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
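  // Tie the recovered sizes to workload ordinals: judging from the tensor
  // shapes below, %38 and %39 are the outer M and N tile counts of the
  // result, %35 and %36 the outer K counts of LHS and RHS, and %37 the
  // bias row count.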
  %35 = flow.dispatch.workload.ordinal %14, 0 : index
  %36 = flow.dispatch.workload.ordinal %19, 1 : index
  %37 = flow.dispatch.workload.ordinal %24, 2 : index
  %38 = flow.dispatch.workload.ordinal %29, 3 : index
  %39 = flow.dispatch.workload.ordinal %34, 4 : index
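  // Bind the buffers: packed LHS and RHS (?x?x16x1), the per-row bias
  // (?x16), and the write-only packed result (?x?x16x16).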
  %40 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%38, %35}
  %41 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
  %42 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x16xf32>>{%37}
  %43 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%38, %39}
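  // Load the full operands from the bound buffers.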
  %44 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0], sizes = [%38, %35, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%38, %35} -> tensor<?x?x16x1xf32>
  %45 = flow.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
  %46 = flow.dispatch.tensor.load %42, offsets = [0, 0], sizes = [%37, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x16xf32>>{%37} -> tensor<?x16xf32>
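  // Allocate and zero-fill the accumulator for the matmul.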
  %47 = tensor.empty(%38, %39) : tensor<?x?x16x16xf32>
  %48 = linalg.fill ins(%cst : f32) outs(%47 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
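  // Packed matmul: mmt4d contracts the 16x1 inner tiles of LHS and RHS into
  // 16x16 tiles of the accumulator, using #matmul_config above.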
  %49 = linalg.mmt4d {lowering_config = #matmul_config} ins(%44, %45 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%48 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
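  // Fused epilogue: broadcast the bias over the outer and inner N
  // dimensions (d1 and d3), add it, and apply ReLU (max with 0.0).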
  %50 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
                                         affine_map<(d0, d1, d2, d3) -> (d0, d2)>,
                                         affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
                        iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
    ins(%49, %46 : tensor<?x?x16x16xf32>, tensor<?x16xf32>)
    outs(%47 : tensor<?x?x16x16xf32>) {
  ^bb0(%in: f32, %in_0: f32, %out: f32):
    %51 = arith.addf %in, %in_0 : f32
    %52 = arith.maximumf %51, %cst : f32
    linalg.yield %52 : f32
  } -> tensor<?x?x16x16xf32>
  flow.dispatch.tensor.store %50, %43, offsets = [0, 0, 0, 0], sizes = [%38, %39, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%38, %39}
  return
}