Last active
May 14, 2025 16:00
-
-
Save llandsmeer/240f5718350fef6feca57010ba4888b3 to your computer and use it in GitHub Desktop.
StreamHLS with JAX
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build script: run the JAX exporter, then push the resulting MLIR through
# iree-opt and the StreamHLS tools to emit host and kernel C++ sources.
#
# -e: stop at the first failing command; -x: echo every command as it runs.
set -ex

# Activate the Python virtualenv that lives one directory up.
. ../env/bin/activate

# Run the exporter script; the next step reads ./jax.stablehlo.mlir.
python3 jax_test.py

# Run IREE's StableHLO input pipeline and convert scf to cf; the result is
# saved as jax.linalg.mlir.
iree-opt ./jax.stablehlo.mlir \
    --iree-stablehlo-input-transformation-pipeline \
    --convert-scf-to-cf \
    > jax.linalg.mlir

# Host side: run the StreamHLS host pipeline, then translate to host_tb.cpp.
streamhls-opt ./jax.linalg.mlir \
    -streamhls-host-pipeline \
    > host.mlir

streamhls-translate ./host.mlir \
    -emit-vivado-hls \
    -vitis-hls-weights-dir=data \
    -vitis-hls-is-host=true \
    -o host_tb.cpp

# Kernel side. top-func is set to main.
# NOTE(review): the exporter shown alongside this script rewrites '@main(' to
# '@forward(' in the file it saves -- confirm that top-func=main still names
# the intended function.
#
# The pipeline options form ONE double-quoted string. Inside double quotes a
# backslash-newline is removed entirely, so every option is followed by an
# explicit space before the backslash; otherwise neighbouring options could
# fuse into a single token (e.g. "graph-file=graphreport-file=report").
streamhls-opt jax.linalg.mlir \
    -streamhls-kernel-pipeline="top-func=main \
        graph-file=graph \
        report-file=report \
        optimize-schedule=1 \
        parallelize-nodes=1 \
        combined-optimization=0 \
        board-dsps=1024 \
        tiling-limit=16 \
        time-limit-minutes=10 \
        bufferize-func-args=0 \
        optimize-conv-reuse=0 \
        minimize-on-chip-buffers=0 \
        debug-point=14" > kernel.mlir

# Translate the kernel MLIR to kernel.cpp.
streamhls-translate \
    kernel.mlir \
    -emit-vivado-hls \
    -o kernel.cpp
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from jax import numpy as jnp | |
from jax import jit | |
import jax | |
def f(rs):
    """Iterate a scalar recurrence 20 times for every entry of ``rs``.

    For each ``r`` in ``rs`` the state starts at ``0.5`` and is updated 20
    times with::

        x <- -x               if x > 0.5
        x <- r * x * (1 - x)  otherwise

    The scan is fully unrolled (``unroll=True``).

    Args:
        rs: Array of ``r`` values; ``jax.vmap`` maps over its leading axis.

    Returns:
        Array containing the final state for each entry of ``rs``.
    """

    def iterate(r):
        def step(x, _):
            # The carry is the updated state; the previous state is emitted
            # as the per-step output, which the caller discards.
            x_next = jax.lax.select(x > 0.5, -x, r * x * (1 - x))
            return x_next, x

        final_x, _history = jax.lax.scan(step, 0.5, length=20, unroll=True)
        return final_x

    return jax.vmap(iterate)(rs)
# Example input used only to fix the argument shape/dtype for lowering.
x = jnp.ones((4,))

# Lower the jitted function for that input and take the textual IR.
mlir = jax.jit(f).lower(x).as_text()

# Save the IR with '@main(' rewritten to '@forward('. The handle is named
# `out` so the module-level function `f` is not rebound to a file object.
with open('jax.stablehlo.mlir', 'w') as out:
    print(mlir.replace('@main(', '@forward('), file=out)

# Echo the unmodified IR (still using '@main(') to stdout.
print(mlir)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#map = affine_map<(d0) -> (d0)> | |
module @jit_f { | |
func.func public @forward(%arg0: tensor<4xf32>) -> tensor<4xf32> { | |
%cst = arith.constant -5.000000e-01 : f32 | |
%false = arith.constant false | |
%cst_0 = arith.constant 1.000000e+00 : f32 | |
%cst_1 = arith.constant 5.000000e-01 : f32 | |
%0 = tensor.empty() : tensor<4xi1> | |
%1 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel"]} outs(%0 : tensor<4xi1>) { | |
^bb0(%out: i1): | |
linalg.yield %false : i1 | |
} -> tensor<4xi1> | |
%2 = tensor.empty() : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel"]} outs(%2 : tensor<4xf32>) { | |
^bb0(%out: f32): | |
linalg.yield %cst : f32 | |
} -> tensor<4xf32> | |
%4 = tensor.empty() : tensor<4xf32> | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg0 : tensor<4xf32>) outs(%4 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.mulf %in, %cst_1 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%6 = tensor.empty() : tensor<4xf32> | |
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%5 : tensor<4xf32>) outs(%6 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.mulf %in, %cst_1 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%8 = tensor.empty() : tensor<4xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%1, %3, %7 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%8 : tensor<4xf32>) { | |
^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32): | |
%238 = arith.select %in, %in_2, %in_3 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%10 = tensor.empty() : tensor<4xi1> | |
%11 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%9 : tensor<4xf32>) outs(%10 : tensor<4xi1>) { | |
^bb0(%in: f32, %out: i1): | |
%238 = arith.cmpf ogt, %in, %cst_1 : f32 | |
linalg.yield %238 : i1 | |
} -> tensor<4xi1> | |
%12 = tensor.empty() : tensor<4xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%9 : tensor<4xf32>) outs(%12 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.negf %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%14 = tensor.empty() : tensor<4xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %9 : tensor<4xf32>, tensor<4xf32>) outs(%14 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%16 = tensor.empty() : tensor<4xf32> | |
%17 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%9 : tensor<4xf32>) outs(%16 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.subf %cst_0, %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%18 = tensor.empty() : tensor<4xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%15, %17 : tensor<4xf32>, tensor<4xf32>) outs(%18 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%20 = tensor.empty() : tensor<4xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%11, %13, %19 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%20 : tensor<4xf32>) { | |
^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32): | |
%238 = arith.select %in, %in_2, %in_3 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%22 = tensor.empty() : tensor<4xi1> | |
%23 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%21 : tensor<4xf32>) outs(%22 : tensor<4xi1>) { | |
^bb0(%in: f32, %out: i1): | |
%238 = arith.cmpf ogt, %in, %cst_1 : f32 | |
linalg.yield %238 : i1 | |
} -> tensor<4xi1> | |
%24 = tensor.empty() : tensor<4xf32> | |
%25 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%21 : tensor<4xf32>) outs(%24 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.negf %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%26 = tensor.empty() : tensor<4xf32> | |
%27 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %21 : tensor<4xf32>, tensor<4xf32>) outs(%26 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%28 = tensor.empty() : tensor<4xf32> | |
%29 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%21 : tensor<4xf32>) outs(%28 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.subf %cst_0, %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%30 = tensor.empty() : tensor<4xf32> | |
%31 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%27, %29 : tensor<4xf32>, tensor<4xf32>) outs(%30 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%32 = tensor.empty() : tensor<4xf32> | |
%33 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%23, %25, %31 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%32 : tensor<4xf32>) { | |
^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32): | |
%238 = arith.select %in, %in_2, %in_3 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%34 = tensor.empty() : tensor<4xi1> | |
%35 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%33 : tensor<4xf32>) outs(%34 : tensor<4xi1>) { | |
^bb0(%in: f32, %out: i1): | |
%238 = arith.cmpf ogt, %in, %cst_1 : f32 | |
linalg.yield %238 : i1 | |
} -> tensor<4xi1> | |
%36 = tensor.empty() : tensor<4xf32> | |
%37 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%33 : tensor<4xf32>) outs(%36 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.negf %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%38 = tensor.empty() : tensor<4xf32> | |
%39 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %33 : tensor<4xf32>, tensor<4xf32>) outs(%38 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%40 = tensor.empty() : tensor<4xf32> | |
%41 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%33 : tensor<4xf32>) outs(%40 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.subf %cst_0, %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%42 = tensor.empty() : tensor<4xf32> | |
%43 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%39, %41 : tensor<4xf32>, tensor<4xf32>) outs(%42 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%44 = tensor.empty() : tensor<4xf32> | |
%45 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%35, %37, %43 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%44 : tensor<4xf32>) { | |
^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32): | |
%238 = arith.select %in, %in_2, %in_3 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%46 = tensor.empty() : tensor<4xi1> | |
%47 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%45 : tensor<4xf32>) outs(%46 : tensor<4xi1>) { | |
^bb0(%in: f32, %out: i1): | |
%238 = arith.cmpf ogt, %in, %cst_1 : f32 | |
linalg.yield %238 : i1 | |
} -> tensor<4xi1> | |
%48 = tensor.empty() : tensor<4xf32> | |
%49 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%45 : tensor<4xf32>) outs(%48 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.negf %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%50 = tensor.empty() : tensor<4xf32> | |
%51 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %45 : tensor<4xf32>, tensor<4xf32>) outs(%50 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%52 = tensor.empty() : tensor<4xf32> | |
%53 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%45 : tensor<4xf32>) outs(%52 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.subf %cst_0, %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%54 = tensor.empty() : tensor<4xf32> | |
%55 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%51, %53 : tensor<4xf32>, tensor<4xf32>) outs(%54 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%56 = tensor.empty() : tensor<4xf32> | |
%57 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%47, %49, %55 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%56 : tensor<4xf32>) { | |
^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32): | |
%238 = arith.select %in, %in_2, %in_3 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%58 = tensor.empty() : tensor<4xi1> | |
%59 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%57 : tensor<4xf32>) outs(%58 : tensor<4xi1>) { | |
^bb0(%in: f32, %out: i1): | |
%238 = arith.cmpf ogt, %in, %cst_1 : f32 | |
linalg.yield %238 : i1 | |
} -> tensor<4xi1> | |
%60 = tensor.empty() : tensor<4xf32> | |
%61 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%57 : tensor<4xf32>) outs(%60 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.negf %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%62 = tensor.empty() : tensor<4xf32> | |
%63 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %57 : tensor<4xf32>, tensor<4xf32>) outs(%62 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%64 = tensor.empty() : tensor<4xf32> | |
%65 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%57 : tensor<4xf32>) outs(%64 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.subf %cst_0, %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%66 = tensor.empty() : tensor<4xf32> | |
%67 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%63, %65 : tensor<4xf32>, tensor<4xf32>) outs(%66 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%68 = tensor.empty() : tensor<4xf32> | |
%69 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%59, %61, %67 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%68 : tensor<4xf32>) { | |
^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32): | |
%238 = arith.select %in, %in_2, %in_3 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%70 = tensor.empty() : tensor<4xi1> | |
%71 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%69 : tensor<4xf32>) outs(%70 : tensor<4xi1>) { | |
^bb0(%in: f32, %out: i1): | |
%238 = arith.cmpf ogt, %in, %cst_1 : f32 | |
linalg.yield %238 : i1 | |
} -> tensor<4xi1> | |
%72 = tensor.empty() : tensor<4xf32> | |
%73 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%69 : tensor<4xf32>) outs(%72 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.negf %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%74 = tensor.empty() : tensor<4xf32> | |
%75 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %69 : tensor<4xf32>, tensor<4xf32>) outs(%74 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%76 = tensor.empty() : tensor<4xf32> | |
%77 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%69 : tensor<4xf32>) outs(%76 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.subf %cst_0, %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%78 = tensor.empty() : tensor<4xf32> | |
%79 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%75, %77 : tensor<4xf32>, tensor<4xf32>) outs(%78 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%80 = tensor.empty() : tensor<4xf32> | |
%81 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%71, %73, %79 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%80 : tensor<4xf32>) { | |
^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32): | |
%238 = arith.select %in, %in_2, %in_3 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%82 = tensor.empty() : tensor<4xi1> | |
%83 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%81 : tensor<4xf32>) outs(%82 : tensor<4xi1>) { | |
^bb0(%in: f32, %out: i1): | |
%238 = arith.cmpf ogt, %in, %cst_1 : f32 | |
linalg.yield %238 : i1 | |
} -> tensor<4xi1> | |
%84 = tensor.empty() : tensor<4xf32> | |
%85 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%81 : tensor<4xf32>) outs(%84 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.negf %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%86 = tensor.empty() : tensor<4xf32> | |
%87 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %81 : tensor<4xf32>, tensor<4xf32>) outs(%86 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%88 = tensor.empty() : tensor<4xf32> | |
%89 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%81 : tensor<4xf32>) outs(%88 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.subf %cst_0, %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%90 = tensor.empty() : tensor<4xf32> | |
%91 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%87, %89 : tensor<4xf32>, tensor<4xf32>) outs(%90 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%92 = tensor.empty() : tensor<4xf32> | |
%93 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%83, %85, %91 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%92 : tensor<4xf32>) { | |
^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32): | |
%238 = arith.select %in, %in_2, %in_3 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%94 = tensor.empty() : tensor<4xi1> | |
%95 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%93 : tensor<4xf32>) outs(%94 : tensor<4xi1>) { | |
^bb0(%in: f32, %out: i1): | |
%238 = arith.cmpf ogt, %in, %cst_1 : f32 | |
linalg.yield %238 : i1 | |
} -> tensor<4xi1> | |
%96 = tensor.empty() : tensor<4xf32> | |
%97 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%93 : tensor<4xf32>) outs(%96 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.negf %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%98 = tensor.empty() : tensor<4xf32> | |
%99 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %93 : tensor<4xf32>, tensor<4xf32>) outs(%98 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%100 = tensor.empty() : tensor<4xf32> | |
%101 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%93 : tensor<4xf32>) outs(%100 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.subf %cst_0, %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%102 = tensor.empty() : tensor<4xf32> | |
%103 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%99, %101 : tensor<4xf32>, tensor<4xf32>) outs(%102 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%104 = tensor.empty() : tensor<4xf32> | |
%105 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%95, %97, %103 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%104 : tensor<4xf32>) { | |
^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32): | |
%238 = arith.select %in, %in_2, %in_3 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%106 = tensor.empty() : tensor<4xi1> | |
%107 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%105 : tensor<4xf32>) outs(%106 : tensor<4xi1>) { | |
^bb0(%in: f32, %out: i1): | |
%238 = arith.cmpf ogt, %in, %cst_1 : f32 | |
linalg.yield %238 : i1 | |
} -> tensor<4xi1> | |
%108 = tensor.empty() : tensor<4xf32> | |
%109 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%105 : tensor<4xf32>) outs(%108 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.negf %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%110 = tensor.empty() : tensor<4xf32> | |
%111 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %105 : tensor<4xf32>, tensor<4xf32>) outs(%110 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%112 = tensor.empty() : tensor<4xf32> | |
%113 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%105 : tensor<4xf32>) outs(%112 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.subf %cst_0, %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%114 = tensor.empty() : tensor<4xf32> | |
%115 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%111, %113 : tensor<4xf32>, tensor<4xf32>) outs(%114 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%116 = tensor.empty() : tensor<4xf32> | |
%117 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%107, %109, %115 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%116 : tensor<4xf32>) { | |
^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32): | |
%238 = arith.select %in, %in_2, %in_3 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%118 = tensor.empty() : tensor<4xi1> | |
%119 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%117 : tensor<4xf32>) outs(%118 : tensor<4xi1>) { | |
^bb0(%in: f32, %out: i1): | |
%238 = arith.cmpf ogt, %in, %cst_1 : f32 | |
linalg.yield %238 : i1 | |
} -> tensor<4xi1> | |
%120 = tensor.empty() : tensor<4xf32> | |
%121 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%117 : tensor<4xf32>) outs(%120 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.negf %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%122 = tensor.empty() : tensor<4xf32> | |
%123 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %117 : tensor<4xf32>, tensor<4xf32>) outs(%122 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%124 = tensor.empty() : tensor<4xf32> | |
%125 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%117 : tensor<4xf32>) outs(%124 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.subf %cst_0, %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%126 = tensor.empty() : tensor<4xf32> | |
%127 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%123, %125 : tensor<4xf32>, tensor<4xf32>) outs(%126 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%128 = tensor.empty() : tensor<4xf32> | |
%129 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%119, %121, %127 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%128 : tensor<4xf32>) { | |
^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32): | |
%238 = arith.select %in, %in_2, %in_3 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%130 = tensor.empty() : tensor<4xi1> | |
%131 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%129 : tensor<4xf32>) outs(%130 : tensor<4xi1>) { | |
^bb0(%in: f32, %out: i1): | |
%238 = arith.cmpf ogt, %in, %cst_1 : f32 | |
linalg.yield %238 : i1 | |
} -> tensor<4xi1> | |
%132 = tensor.empty() : tensor<4xf32> | |
%133 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%129 : tensor<4xf32>) outs(%132 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.negf %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%134 = tensor.empty() : tensor<4xf32> | |
%135 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %129 : tensor<4xf32>, tensor<4xf32>) outs(%134 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%136 = tensor.empty() : tensor<4xf32> | |
%137 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%129 : tensor<4xf32>) outs(%136 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.subf %cst_0, %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%138 = tensor.empty() : tensor<4xf32> | |
%139 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%135, %137 : tensor<4xf32>, tensor<4xf32>) outs(%138 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%140 = tensor.empty() : tensor<4xf32> | |
%141 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%131, %133, %139 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%140 : tensor<4xf32>) { | |
^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32): | |
%238 = arith.select %in, %in_2, %in_3 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%142 = tensor.empty() : tensor<4xi1> | |
%143 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%141 : tensor<4xf32>) outs(%142 : tensor<4xi1>) { | |
^bb0(%in: f32, %out: i1): | |
%238 = arith.cmpf ogt, %in, %cst_1 : f32 | |
linalg.yield %238 : i1 | |
} -> tensor<4xi1> | |
%144 = tensor.empty() : tensor<4xf32> | |
%145 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%141 : tensor<4xf32>) outs(%144 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.negf %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%146 = tensor.empty() : tensor<4xf32> | |
%147 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %141 : tensor<4xf32>, tensor<4xf32>) outs(%146 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%148 = tensor.empty() : tensor<4xf32> | |
%149 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%141 : tensor<4xf32>) outs(%148 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.subf %cst_0, %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%150 = tensor.empty() : tensor<4xf32> | |
%151 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%147, %149 : tensor<4xf32>, tensor<4xf32>) outs(%150 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%152 = tensor.empty() : tensor<4xf32> | |
%153 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%143, %145, %151 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%152 : tensor<4xf32>) { | |
^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32): | |
%238 = arith.select %in, %in_2, %in_3 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%154 = tensor.empty() : tensor<4xi1> | |
%155 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%153 : tensor<4xf32>) outs(%154 : tensor<4xi1>) { | |
^bb0(%in: f32, %out: i1): | |
%238 = arith.cmpf ogt, %in, %cst_1 : f32 | |
linalg.yield %238 : i1 | |
} -> tensor<4xi1> | |
%156 = tensor.empty() : tensor<4xf32> | |
%157 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%153 : tensor<4xf32>) outs(%156 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.negf %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%158 = tensor.empty() : tensor<4xf32> | |
%159 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %153 : tensor<4xf32>, tensor<4xf32>) outs(%158 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%160 = tensor.empty() : tensor<4xf32> | |
%161 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%153 : tensor<4xf32>) outs(%160 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.subf %cst_0, %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%162 = tensor.empty() : tensor<4xf32> | |
%163 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%159, %161 : tensor<4xf32>, tensor<4xf32>) outs(%162 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%164 = tensor.empty() : tensor<4xf32> | |
%165 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%155, %157, %163 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%164 : tensor<4xf32>) { | |
^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32): | |
%238 = arith.select %in, %in_2, %in_3 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%166 = tensor.empty() : tensor<4xi1> | |
%167 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%165 : tensor<4xf32>) outs(%166 : tensor<4xi1>) { | |
^bb0(%in: f32, %out: i1): | |
%238 = arith.cmpf ogt, %in, %cst_1 : f32 | |
linalg.yield %238 : i1 | |
} -> tensor<4xi1> | |
%168 = tensor.empty() : tensor<4xf32> | |
%169 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%165 : tensor<4xf32>) outs(%168 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.negf %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%170 = tensor.empty() : tensor<4xf32> | |
%171 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %165 : tensor<4xf32>, tensor<4xf32>) outs(%170 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%172 = tensor.empty() : tensor<4xf32> | |
%173 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%165 : tensor<4xf32>) outs(%172 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.subf %cst_0, %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%174 = tensor.empty() : tensor<4xf32> | |
%175 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%171, %173 : tensor<4xf32>, tensor<4xf32>) outs(%174 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%176 = tensor.empty() : tensor<4xf32> | |
%177 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%167, %169, %175 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%176 : tensor<4xf32>) { | |
^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32): | |
%238 = arith.select %in, %in_2, %in_3 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%178 = tensor.empty() : tensor<4xi1> | |
%179 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%177 : tensor<4xf32>) outs(%178 : tensor<4xi1>) { | |
^bb0(%in: f32, %out: i1): | |
%238 = arith.cmpf ogt, %in, %cst_1 : f32 | |
linalg.yield %238 : i1 | |
} -> tensor<4xi1> | |
%180 = tensor.empty() : tensor<4xf32> | |
%181 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%177 : tensor<4xf32>) outs(%180 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.negf %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%182 = tensor.empty() : tensor<4xf32> | |
%183 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %177 : tensor<4xf32>, tensor<4xf32>) outs(%182 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%184 = tensor.empty() : tensor<4xf32> | |
%185 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%177 : tensor<4xf32>) outs(%184 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.subf %cst_0, %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%186 = tensor.empty() : tensor<4xf32> | |
%187 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%183, %185 : tensor<4xf32>, tensor<4xf32>) outs(%186 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%188 = tensor.empty() : tensor<4xf32> | |
%189 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%179, %181, %187 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%188 : tensor<4xf32>) { | |
^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32): | |
%238 = arith.select %in, %in_2, %in_3 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%190 = tensor.empty() : tensor<4xi1> | |
%191 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%189 : tensor<4xf32>) outs(%190 : tensor<4xi1>) { | |
^bb0(%in: f32, %out: i1): | |
%238 = arith.cmpf ogt, %in, %cst_1 : f32 | |
linalg.yield %238 : i1 | |
} -> tensor<4xi1> | |
%192 = tensor.empty() : tensor<4xf32> | |
%193 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%189 : tensor<4xf32>) outs(%192 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.negf %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%194 = tensor.empty() : tensor<4xf32> | |
%195 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %189 : tensor<4xf32>, tensor<4xf32>) outs(%194 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%196 = tensor.empty() : tensor<4xf32> | |
%197 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%189 : tensor<4xf32>) outs(%196 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.subf %cst_0, %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%198 = tensor.empty() : tensor<4xf32> | |
%199 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%195, %197 : tensor<4xf32>, tensor<4xf32>) outs(%198 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%200 = tensor.empty() : tensor<4xf32> | |
%201 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%191, %193, %199 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%200 : tensor<4xf32>) { | |
^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32): | |
%238 = arith.select %in, %in_2, %in_3 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%202 = tensor.empty() : tensor<4xi1> | |
%203 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%201 : tensor<4xf32>) outs(%202 : tensor<4xi1>) { | |
^bb0(%in: f32, %out: i1): | |
%238 = arith.cmpf ogt, %in, %cst_1 : f32 | |
linalg.yield %238 : i1 | |
} -> tensor<4xi1> | |
%204 = tensor.empty() : tensor<4xf32> | |
%205 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%201 : tensor<4xf32>) outs(%204 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.negf %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%206 = tensor.empty() : tensor<4xf32> | |
%207 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %201 : tensor<4xf32>, tensor<4xf32>) outs(%206 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%208 = tensor.empty() : tensor<4xf32> | |
%209 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%201 : tensor<4xf32>) outs(%208 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.subf %cst_0, %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%210 = tensor.empty() : tensor<4xf32> | |
%211 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%207, %209 : tensor<4xf32>, tensor<4xf32>) outs(%210 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%212 = tensor.empty() : tensor<4xf32> | |
%213 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%203, %205, %211 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%212 : tensor<4xf32>) { | |
^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32): | |
%238 = arith.select %in, %in_2, %in_3 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%214 = tensor.empty() : tensor<4xi1> | |
%215 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%213 : tensor<4xf32>) outs(%214 : tensor<4xi1>) { | |
^bb0(%in: f32, %out: i1): | |
%238 = arith.cmpf ogt, %in, %cst_1 : f32 | |
linalg.yield %238 : i1 | |
} -> tensor<4xi1> | |
%216 = tensor.empty() : tensor<4xf32> | |
%217 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%213 : tensor<4xf32>) outs(%216 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.negf %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%218 = tensor.empty() : tensor<4xf32> | |
%219 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %213 : tensor<4xf32>, tensor<4xf32>) outs(%218 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%220 = tensor.empty() : tensor<4xf32> | |
%221 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%213 : tensor<4xf32>) outs(%220 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.subf %cst_0, %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%222 = tensor.empty() : tensor<4xf32> | |
%223 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%219, %221 : tensor<4xf32>, tensor<4xf32>) outs(%222 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%224 = tensor.empty() : tensor<4xf32> | |
%225 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%215, %217, %223 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%224 : tensor<4xf32>) { | |
^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32): | |
%238 = arith.select %in, %in_2, %in_3 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%226 = tensor.empty() : tensor<4xi1> | |
%227 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%225 : tensor<4xf32>) outs(%226 : tensor<4xi1>) { | |
^bb0(%in: f32, %out: i1): | |
%238 = arith.cmpf ogt, %in, %cst_1 : f32 | |
linalg.yield %238 : i1 | |
} -> tensor<4xi1> | |
%228 = tensor.empty() : tensor<4xf32> | |
%229 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%225 : tensor<4xf32>) outs(%228 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.negf %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%230 = tensor.empty() : tensor<4xf32> | |
%231 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %225 : tensor<4xf32>, tensor<4xf32>) outs(%230 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%232 = tensor.empty() : tensor<4xf32> | |
%233 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%225 : tensor<4xf32>) outs(%232 : tensor<4xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%238 = arith.subf %cst_0, %in : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%234 = tensor.empty() : tensor<4xf32> | |
%235 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%231, %233 : tensor<4xf32>, tensor<4xf32>) outs(%234 : tensor<4xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%238 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
%236 = tensor.empty() : tensor<4xf32> | |
%237 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%227, %229, %235 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%236 : tensor<4xf32>) { | |
^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32): | |
%238 = arith.select %in, %in_2, %in_3 : f32 | |
linalg.yield %238 : f32 | |
} -> tensor<4xf32> | |
return %237 : tensor<4xf32> | |
} | |
} | |
Author
llandsmeer
commented
May 14, 2025
Hi @llandsmeer,
Thank you for your interest and for pointing this out. I added support for any function name, while keeping the default as forward. However, if the kernel is named "main", it will conflict with the host code, which defines its own main function. So, the top function in the input linalg.mlir can have any name except "main" for the flows to work.
// Identity indexing map, shared by the input and the output operand below.
#map = affine_map<(d0) -> (d0)>
module @jit_f {
  // renamed main -> kernel
  func.func public @kernel(%arg0: tensor<4xf32>) -> tensor<4xf32> {
    %one = arith.constant 1.000000e+00 : f32
    %init = tensor.empty() : tensor<4xf32>
    // Elementwise over the 4 lanes: result[i] = arg0[i] + 1.0
    %result = linalg.generic {
        indexing_maps = [#map, #map],
        iterator_types = ["parallel"]
      } ins(%arg0 : tensor<4xf32>) outs(%init : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %sum = arith.addf %in, %one : f32
      linalg.yield %sum : f32
    } -> tensor<4xf32>
    return %result : tensor<4xf32>
  }
}
run.sh
# Name of the top-level function in jax.linalg.mlir.
# It must not be "main": per the note above, that name conflicts with the
# main function of the generated host code.
kernel_name=kernel

# NOTE: comments must never follow a line-continuation backslash
# ("\ # ..."): the backslash would then escape the space instead of the
# newline and the command would be cut short. Keep them on their own lines.

# updated: the host pipeline now receives the top function name.
streamhls-opt ./jax.linalg.mlir \
  -streamhls-host-pipeline="top-func=${kernel_name}" \
  > host.mlir

# Translate host.mlir into host_tb.cpp.
streamhls-translate ./host.mlir \
  -emit-vivado-hls \
  -vitis-hls-weights-dir=data \
  -vitis-hls-is-host=true \
  -o host_tb.cpp

# updated: the kernel pipeline now receives the top function name.
# Inside the quoted option string a backslash-newline is removed entirely,
# so every option ends with " \" to stay separated from the next one.
streamhls-opt jax.linalg.mlir \
  -streamhls-kernel-pipeline="top-func=${kernel_name} \
graph-file=graph \
report-file=report \
optimize-schedule=1 \
parallelize-nodes=1 \
combined-optimization=0 \
board-dsps=1024 \
tiling-limit=16 \
time-limit-minutes=10 \
bufferize-func-args=0 \
optimize-conv-reuse=0 \
minimize-on-chip-buffers=0 \
debug-point=14" > kernel.mlir

# updated: the translator is told the top function name as well.
streamhls-translate \
  kernel.mlir \
  -emit-vivado-hls \
  -vitis-hls-top-func="${kernel_name}" \
  -o kernel.cpp
expected output:
Permutation DesignSpaceSize: 1
Parallelization DesignSpaceSize: 9
Total DesignSpaceSize: 9
Permutation solver: latency: 3
Permutation DesignSpaceSize: 1
Parallelization DesignSpaceSize: 9
Total DesignSpaceSize: 9
Parallelization solver: Parallel Latency: 3
Total DSPs: 2
Please let me know if you face any other issues.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment