Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save AmosLewis/d171b35532b67a44a0a5d0a8a20504d1 to your computer and use it in GitHub Desktop.
// ---- Target and affine-map declarations for this IREE flow-level module ----
// LLVM-CPU executable target: embedded-ELF x86-64, tuned for "haswell"
// (AVX2/FMA enabled, no AVX-512), 32-byte native vectors.
// NOTE(review): the `ukernels` attribute carries the string
// "--iree-llvmcpu-stack-allocation-limit=256000", which looks like a CLI flag
// pasted into the wrong attribute (ukernels normally selects microkernels) —
// confirm against the compiler invocation that produced this dump.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "haswell", cpu_features = "-prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-xsaves,-avx512fp16,-usermsr,-sm4,+sse4.1,-avx512ifma,+xsave,-avx512pf,+sse4.2,-tsxldtrk,-ptwrite,-widekl,-sm3,+invpcid,+64bit,-xsavec,-avx10.1-512,-avx512vpopcntdq,+cmov,-avx512vp2intersect,-avx512cd,+movbe,-avxvnniint8,-avx512er,-amx-int8,-kl,-avx10.1-256,-sha512,-avxvnni,-rtm,-adx,+avx2,-hreset,-movdiri,-serialize,-vpclmulqdq,-avx512vl,-uintr,-clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-gfni,-avxvnniint16,-amx-fp16,+xsaveopt,+rdrnd,-avx512f,-amx-bf16,-avx512bf16,-avx512vnni,+cx8,-avx512bw,+sse3,-pku,+fsgsbase,-clzero,-mwaitx,-lwp,+lzcnt,-sha,-movdir64b,-wbnoinvd,-enqcmd,-prefetchwt1,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,-avx512bitalg,-rdpru,-clwb,+mmx,+sse2,-rdseed,-avx512vbmi2,-prefetchi,-rdpid,-fma4,-avx512vbmi,-shstk,-vaes,-waitpkg,-sgx,+fxsr,-avx512dq,-sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "--iree-llvmcpu-stack-allocation-limit=256000"}>
// 1-D identity map: element i reads/writes position i.
#map = affine_map<(d0) -> (d0)>
// 2-D iteration space, operand indexed by the row dimension only (broadcast
// along columns).
#map1 = affine_map<(d0, d1) -> (d0)>
// 2-D iteration space, operand indexed by the column dimension only
// (broadcast along rows).
#map2 = affine_map<(d0, d1) -> (d1)>
// 2-D identity map.
#map3 = affine_map<(d0, d1) -> (d0, d1)>
// 0-D (scalar tensor) map.
#map4 = affine_map<() -> ()>
// Scalar operand broadcast over a 2-D iteration space.
#map5 = affine_map<(d0, d1) -> ()>
// Single-device target wrapping the CPU executable target above.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu], iree.consteval} {
// Dispatch: produce a 32-element f16 tensor where
//   out[i] = (i * 2.0 + 0.0) / 64.0   (i.e. i / 32).
// No inputs; the result is written to the write-only output binding.
flow.executable private @jit_eval_dispatch_0 {
// Workgroup count is derived from the dispatch slice (data-independent here).
flow.executable.export public @jit_eval_dispatch_0_generic_32_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @jit_eval_dispatch_0_generic_32_f16(%arg0: !flow.dispatch.tensor<writeonly:tensor<32xf16>>) {
%cst = arith.constant 2.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f16
%cst_1 = arith.constant 6.400000e+01 : f16
%0 = tensor.empty() : tensor<32xf16>
// Output-only generic: each lane computes from its own index.
%1 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel"]} outs(%0 : tensor<32xf16>) {
^bb0(%out: f16):
// Current iteration index i, converted i64 -> f16.
%2 = linalg.index 0 : index
%3 = arith.index_cast %2 : index to i64
%4 = arith.sitofp %3 : i64 to f16
// (i * 2 + 0) / 64
%5 = arith.mulf %4, %cst : f16
%6 = arith.addf %5, %cst_0 : f16
%7 = arith.divf %6, %cst_1 : f16
linalg.yield %7 : f16
} -> tensor<32xf16>
flow.dispatch.tensor.store %1, %arg0, offsets = [0], sizes = [32], strides = [1] : tensor<32xf16> -> !flow.dispatch.tensor<writeonly:tensor<32xf16>>
return
}
}
}
// Public entry point: runs @jit_eval_dispatch_0 (produces i/32 for i in
// 0..31 as f16) and exports the result as a HAL buffer view.
func.func @jit_eval() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval() -> (%output0: tensor<32xf16>)"}} {
%0 = flow.dispatch @jit_eval_dispatch_0::@jit_eval_dispatch_0_generic_32_f16() : () -> tensor<32xf16>
%1 = hal.tensor.export %0 "output0" : tensor<32xf16> -> !hal.buffer_view
return %1 : !hal.buffer_view
}
// Dispatch: elementwise out[i] = 1.0 / (10000.0 ^ in[i]) over 32 f16 values,
// with a runtime assert that the power is nonzero before dividing.
// NOTE(review): 1/10000^x is the classic inverse-frequency formula used by
// rotary/sinusoidal position embeddings — presumably that is the intent here;
// confirm against the producing frontend.
flow.executable private @jit_eval_0_dispatch_0 {
flow.executable.export public @jit_eval_0_dispatch_0_generic_32_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @jit_eval_0_dispatch_0_generic_32_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<32xf16>>, %arg1: !flow.dispatch.tensor<writeonly:tensor<32xf16>>) {
%cst = arith.constant 1.000000e+04 : f16
%cst_0 = arith.constant 0.000000e+00 : f16
%cst_1 = arith.constant 1.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32xf16>> -> tensor<32xf16>
%1 = tensor.empty() : tensor<32xf16>
%2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<32xf16>) outs(%1 : tensor<32xf16>) {
^bb0(%in: f16, %out: f16):
// 10000 ^ in
%3 = math.powf %cst, %in : f16
// Guard against division by zero ("one" = ordered not-equal).
%4 = arith.cmpf one, %3, %cst_0 : f16
cf.assert %4, "unimplemented: tensor with zero element"
// 1 / (10000 ^ in)
%5 = arith.divf %cst_1, %3 : f16
linalg.yield %5 : f16
} -> tensor<32xf16>
flow.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [32], strides = [1] : tensor<32xf16> -> !flow.dispatch.tensor<writeonly:tensor<32xf16>>
return
}
}
}
// Public entry point: imports a 32xf16 tensor, applies the 1/10000^x dispatch
// above, and exports the result.
func.func @jit_eval_0(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_0(%input0: tensor<32xf16>) -> (%output0: tensor<32xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32xf16>
%1 = flow.dispatch @jit_eval_0_dispatch_0::@jit_eval_0_dispatch_0_generic_32_f16(%0) : (tensor<32xf16>) -> tensor<32xf16>
%2 = hal.tensor.export %1 "output0" : tensor<32xf16> -> !hal.buffer_view
return %2 : !hal.buffer_view
}
// Dispatch: iota — out[i] = f16(i) + 0.0 for i in 0..32767.
// NOTE(review): f16 has an 11-bit significand, so integer indices above 2048
// are not exactly representable; values toward 32767 will be rounded.
// Confirm this precision loss is acceptable for the consumer (jit_eval_2).
flow.executable private @jit_eval_1_dispatch_0 {
flow.executable.export public @jit_eval_1_dispatch_0_generic_32768_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @jit_eval_1_dispatch_0_generic_32768_f16(%arg0: !flow.dispatch.tensor<writeonly:tensor<32768xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<32768xf16>
%1 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel"]} outs(%0 : tensor<32768xf16>) {
^bb0(%out: f16):
// i (lane index) converted to f16, plus a zero offset.
%2 = linalg.index 0 : index
%3 = arith.index_cast %2 : index to i64
%4 = arith.sitofp %3 : i64 to f16
%5 = arith.addf %4, %cst : f16
linalg.yield %5 : f16
} -> tensor<32768xf16>
flow.dispatch.tensor.store %1, %arg0, offsets = [0], sizes = [32768], strides = [1] : tensor<32768xf16> -> !flow.dispatch.tensor<writeonly:tensor<32768xf16>>
return
}
}
}
// Public entry point: runs the 32768-element iota dispatch and exports it.
func.func @jit_eval_1() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_1() -> (%output0: tensor<32768xf16>)"}} {
%0 = flow.dispatch @jit_eval_1_dispatch_0::@jit_eval_1_dispatch_0_generic_32768_f16() : () -> tensor<32768xf16>
%1 = hal.tensor.export %0 "output0" : tensor<32768xf16> -> !hal.buffer_view
return %1 : !hal.buffer_view
}
// Dispatch: over the 32768x32 outer product p = in0[row] * in1[col],
// produce two results in one pass: out0 = sin(p), out1 = cos(p).
// in0 is broadcast along columns (#map1), in1 along rows (#map2).
flow.executable private @jit_eval_2_dispatch_0 {
flow.executable.export public @jit_eval_2_dispatch_0_generic_32768x32_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @jit_eval_2_dispatch_0_generic_32768x32_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<32768xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<32xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<32768x32xf16>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<32768x32xf16>>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [32768], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32768xf16>> -> tensor<32768xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32xf16>> -> tensor<32xf16>
%2 = tensor.empty() : tensor<32768x32xf16>
// Two-result generic: both outputs share the same iteration space.
%3:2 = linalg.generic {indexing_maps = [#map1, #map2, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%0, %1 : tensor<32768xf16>, tensor<32xf16>) outs(%2, %2 : tensor<32768x32xf16>, tensor<32768x32xf16>) {
^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: f16):
%4 = arith.mulf %in, %in_0 : f16
%5 = math.sin %4 : f16
%6 = math.cos %4 : f16
linalg.yield %5, %6 : f16, f16
} -> (tensor<32768x32xf16>, tensor<32768x32xf16>)
// Result 0 = sin table, result 1 = cos table.
flow.dispatch.tensor.store %3#0, %arg2, offsets = [0, 0], sizes = [32768, 32], strides = [1, 1] : tensor<32768x32xf16> -> !flow.dispatch.tensor<writeonly:tensor<32768x32xf16>>
flow.dispatch.tensor.store %3#1, %arg3, offsets = [0, 0], sizes = [32768, 32], strides = [1, 1] : tensor<32768x32xf16> -> !flow.dispatch.tensor<writeonly:tensor<32768x32xf16>>
return
}
}
}
// Dispatch: copy a 32768x32 f16 tensor into the even lanes (last-dim
// offset 0) of a read-write 32768x32x2 destination, leaving the odd lanes
// untouched. The export name marks this as a slow (strided) memcpy.
flow.executable private @jit_eval_2_dispatch_1 {
flow.executable.export public @jit_eval_2_dispatch_1_slow_memcpy workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @jit_eval_2_dispatch_1_slow_memcpy(%arg0: !flow.dispatch.tensor<readonly:tensor<32768x32xf16>>, %arg1: !flow.dispatch.tensor<readwrite:tensor<32768x32x2xf16>>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32768, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32768x32xf16>> -> tensor<32768x32xf16>
// Store into [.., .., 0]: element 0 of each size-2 pair.
flow.dispatch.tensor.store %0, %arg1, offsets = [0, 0, 0], sizes = [32768, 32, 1], strides = [1, 1, 1] : tensor<32768x32xf16> -> !flow.dispatch.tensor<readwrite:tensor<32768x32x2xf16>>
return
}
}
}
// Dispatch: copy a 32768x32 f16 tensor into the odd lanes (last-dim
// offset 1) of a read-write 32768x32x2 destination — the counterpart of
// @jit_eval_2_dispatch_1, which fills the even lanes.
flow.executable private @jit_eval_2_dispatch_2 {
flow.executable.export public @jit_eval_2_dispatch_2_slow_memcpy workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @jit_eval_2_dispatch_2_slow_memcpy(%arg0: !flow.dispatch.tensor<readonly:tensor<32768x32xf16>>, %arg1: !flow.dispatch.tensor<readwrite:tensor<32768x32x2xf16>>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32768, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32768x32xf16>> -> tensor<32768x32xf16>
// Store into [.., .., 1]: element 1 of each size-2 pair.
flow.dispatch.tensor.store %0, %arg1, offsets = [0, 0, 1], sizes = [32768, 32, 1], strides = [1, 1, 1] : tensor<32768x32xf16> -> !flow.dispatch.tensor<readwrite:tensor<32768x32x2xf16>>
return
}
}
}
// Public entry point: builds an interleaved (cos, sin) table and returns the
// first 4 rows.
//   1. dispatch_0 produces sin and cos of the 32768x32 outer product of the
//      two inputs.
//   2. dispatch_1 writes the cos table (%2#1) into lane 0 of a 32768x32x2
//      buffer; dispatch_2 writes the sin table (%2#0) into lane 1, so the
//      last dim is [cos, sin].
//   3. Reshape + slice keeps only the first 4 of 32768 positions -> 4x32x2.
// NOTE(review): this matches a rotary-position-embedding cache build
// (positions x frequencies, interleaved cos/sin) — confirm with the frontend.
func.func @jit_eval_2(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_2(%input0: tensor<32768xf16>, %input1: tensor<32xf16>) -> (%output0: tensor<4x32x2xf16>)"}} {
%c2 = arith.constant 2 : index
%c32 = arith.constant 32 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32768xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<32xf16>
// %2#0 = sin table, %2#1 = cos table (see dispatch_0).
%2:2 = flow.dispatch @jit_eval_2_dispatch_0::@jit_eval_2_dispatch_0_generic_32768x32_f16(%0, %1) : (tensor<32768xf16>, tensor<32xf16>) -> (tensor<32768x32xf16>, tensor<32768x32xf16>)
%3 = flow.tensor.empty : tensor<32768x32x2xf16>
// In-place updates of %3: cos into lane 0, then sin into lane 1.
%4 = flow.dispatch @jit_eval_2_dispatch_1::@jit_eval_2_dispatch_1_slow_memcpy(%2#1, %3) : (tensor<32768x32xf16>, tensor<32768x32x2xf16>) -> %3
%5 = flow.dispatch @jit_eval_2_dispatch_2::@jit_eval_2_dispatch_2_slow_memcpy(%2#0, %4) : (tensor<32768x32xf16>, tensor<32768x32x2xf16>) -> %4
// Keep only the first 4 positions: 32768x32x2 -> 4x32x2.
%6 = flow.tensor.reshape %5 : tensor<32768x32x2xf16> -> tensor<1x32768x32x2xf16>
%7 = flow.tensor.slice %6[%c0, %c0, %c0, %c0 for %c1, %c4, %c32, %c2] : tensor<1x32768x32x2xf16> -> tensor<1x4x32x2xf16>
%8 = flow.tensor.reshape %7 : tensor<1x4x32x2xf16> -> tensor<4x32x2xf16>
%9 = hal.tensor.export %8 "output0" : tensor<4x32x2xf16> -> !hal.buffer_view
return %9 : !hal.buffer_view
}
// Dispatch: gather the even (last-dim index 0) lanes of a 4x1x1x32x2 tensor
// into a dense 4x32 result — a strided "de-interleave" copy.
flow.executable private @jit_eval_4_dispatch_0 {
flow.executable.export public @jit_eval_4_dispatch_0_slow_memcpy workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @jit_eval_4_dispatch_0_slow_memcpy(%arg0: !flow.dispatch.tensor<readonly:tensor<4x1x1x32x2xf16>>, %arg1: !flow.dispatch.tensor<writeonly:tensor<4x32xf16>>) {
// Load size-1 slice at last-dim offset 0, with unit dims dropped -> 4x32.
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0, 0, 0], sizes = [4, 1, 1, 32, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x1x1x32x2xf16>> -> tensor<4x32xf16>
flow.dispatch.tensor.store %0, %arg1, offsets = [0, 0], sizes = [4, 32], strides = [1, 1] : tensor<4x32xf16> -> !flow.dispatch.tensor<writeonly:tensor<4x32xf16>>
return
}
}
}
// Public entry point: extracts the lane-0 plane of a 4x32x2 tensor -> 4x32.
// Given jit_eval_2's layout ([cos, sin] in the last dim), this selects the
// cos half — presumably; confirm the caller feeds jit_eval_2's output here.
func.func @jit_eval_4(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_4(%input0: tensor<4x32x2xf16>) -> (%output0: tensor<4x32xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<4x32x2xf16>
%1 = flow.tensor.reshape %0 : tensor<4x32x2xf16> -> tensor<4x1x1x32x2xf16>
%2 = flow.dispatch @jit_eval_4_dispatch_0::@jit_eval_4_dispatch_0_slow_memcpy(%1) : (tensor<4x1x1x32x2xf16>) -> tensor<4x32xf16>
%3 = hal.tensor.export %2 "output0" : tensor<4x32xf16> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// Dispatch: gather the odd (last-dim index 1) lanes of a 4x1x1x32x2 tensor
// into a dense 4x32 result — the counterpart of @jit_eval_4_dispatch_0,
// which gathers the even lanes.
flow.executable private @jit_eval_5_dispatch_0 {
flow.executable.export public @jit_eval_5_dispatch_0_slow_memcpy workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @jit_eval_5_dispatch_0_slow_memcpy(%arg0: !flow.dispatch.tensor<readonly:tensor<4x1x1x32x2xf16>>, %arg1: !flow.dispatch.tensor<writeonly:tensor<4x32xf16>>) {
// Load size-1 slice at last-dim offset 1, with unit dims dropped -> 4x32.
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0, 0, 1], sizes = [4, 1, 1, 32, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x1x1x32x2xf16>> -> tensor<4x32xf16>
flow.dispatch.tensor.store %0, %arg1, offsets = [0, 0], sizes = [4, 32], strides = [1, 1] : tensor<4x32xf16> -> !flow.dispatch.tensor<writeonly:tensor<4x32xf16>>
return
}
}
}
// Public entry point: extracts the lane-1 plane of a 4x32x2 tensor -> 4x32.
// Given jit_eval_2's layout ([cos, sin] in the last dim), this selects the
// sin half — presumably; confirm the caller feeds jit_eval_2's output here.
func.func @jit_eval_5(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_5(%input0: tensor<4x32x2xf16>) -> (%output0: tensor<4x32xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<4x32x2xf16>
%1 = flow.tensor.reshape %0 : tensor<4x32x2xf16> -> tensor<4x1x1x32x2xf16>
%2 = flow.dispatch @jit_eval_5_dispatch_0::@jit_eval_5_dispatch_0_slow_memcpy(%1) : (tensor<4x1x1x32x2xf16>) -> tensor<4x32xf16>
%3 = hal.tensor.export %2 "output0" : tensor<4x32xf16> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// Dispatch: write the scalar constant 0.0 (f16) into a 0-D output tensor.
// NOTE(review): the export name says "transpose", but the body is a plain
// zero fill — likely a fused/renamed dispatch from an earlier transpose;
// confirm against the original program.
flow.executable private @jit_eval_6_dispatch_0 {
// Single workgroup: the result is a scalar.
flow.executable.export public @jit_eval_6_dispatch_0_transpose workgroups() -> (index, index, index) {
%c1 = arith.constant 1 : index
flow.return %c1, %c1, %c1 : index, index, index
}
builtin.module {
func.func @jit_eval_6_dispatch_0_transpose(%arg0: !flow.dispatch.tensor<writeonly:tensor<f16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<f16>
// 0-D generic over an empty iteration space: yields the constant once.
%1 = linalg.generic {indexing_maps = [#map4], iterator_types = []} outs(%0 : tensor<f16>) {
^bb0(%out: f16):
linalg.yield %cst : f16
} -> tensor<f16>
flow.dispatch.tensor.store %1, %arg0, offsets = [], sizes = [], strides = [] : tensor<f16> -> !flow.dispatch.tensor<writeonly:tensor<f16>>
return
}
}
}
// Public entry point: returns a 0-D f16 tensor containing 0.0.
func.func @jit_eval_6() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_6() -> (%output0: tensor<f16>)"}} {
%0 = flow.dispatch @jit_eval_6_dispatch_0::@jit_eval_6_dispatch_0_transpose() : () -> tensor<f16>
%1 = hal.tensor.export %0 "output0" : tensor<f16> -> !hal.buffer_view
return %1 : !hal.buffer_view
}
// Public entry point: returns a 0-D f32 tensor splatted with -infinity
// (0xFF800000 is the IEEE-754 single-precision bit pattern for -inf).
// No dispatch needed — flow.tensor.splat handles it directly.
func.func @jit_eval_7() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_7() -> (%output0: tensor<f32>)"}} {
%cst = arith.constant 0xFF800000 : f32
%0 = flow.tensor.splat %cst : tensor<f32>
%1 = hal.tensor.export %0 "output0" : tensor<f32> -> !hal.buffer_view
return %1 : !hal.buffer_view
}
// Dispatch: build a 4x4 f16 matrix from two scalars:
//   out[r][c] = truncf(in0 : f32)  when c > r   (strict upper triangle)
//   out[r][c] = in1 : f16          when c <= r  (diagonal and below)
// NOTE(review): with in0 = -inf (see @jit_eval_7) and in1 = 0, this is a
// causal attention mask — presumably the intent; confirm with the caller.
flow.executable private @jit_eval_8_dispatch_0 {
flow.executable.export public @jit_eval_8_dispatch_0_generic_4x4_f32xf16xf16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @jit_eval_8_dispatch_0_generic_4x4_f32xf16xf16(%arg0: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<f16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<4x4xf16>>) {
%false = arith.constant false
%0 = flow.dispatch.tensor.load %arg0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f16>> -> tensor<f16>
%2 = tensor.empty() : tensor<4x4xf16>
// Both scalar inputs broadcast (#map5) over the 4x4 iteration space.
%3 = linalg.generic {indexing_maps = [#map5, #map5, #map3], iterator_types = ["parallel", "parallel"]} ins(%0, %1 : tensor<f32>, tensor<f16>) outs(%2 : tensor<4x4xf16>) {
^bb0(%in: f32, %in_0: f16, %out: f16):
// f32 scalar narrowed to f16 for the upper-triangle value.
%4 = arith.truncf %in : f32 to f16
// r = row index, c = column index (as i64).
%5 = linalg.index 0 : index
%6 = linalg.index 1 : index
%7 = arith.index_cast %5 : index to i64
%8 = arith.index_cast %6 : index to i64
// %9 = (c <= r); %10 = !(c <= r), i.e. c > r.
%9 = arith.cmpi sle, %8, %7 : i64
%10 = arith.cmpi eq, %9, %false : i1
// c > r ? truncf(in0) : in1
%11 = arith.select %10, %4, %in_0 : f16
linalg.yield %11 : f16
} -> tensor<4x4xf16>
flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [4, 4], strides = [1, 1] : tensor<4x4xf16> -> !flow.dispatch.tensor<writeonly:tensor<4x4xf16>>
return
}
}
}
// Public entry point: imports two scalars (f32, f16) and returns the 4x4
// triangular-select matrix built by @jit_eval_8_dispatch_0.
func.func @jit_eval_8(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_8(%input0: tensor<f32>, %input1: tensor<f16>) -> (%output0: tensor<4x4xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<f32>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<f16>
%2 = flow.dispatch @jit_eval_8_dispatch_0::@jit_eval_8_dispatch_0_generic_4x4_f32xf16xf16(%0, %1) : (tensor<f32>, tensor<f16>) -> tensor<4x4xf16>
%3 = hal.tensor.export %2 "output0" : tensor<4x4xf16> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment