Created
February 5, 2025 14:48
-
-
Save pashu123/73b85145aef8c043e1acaeeb7873221b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| hal.executable public @main$async_dispatch_175 { | |
| hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>) { | |
// Export record for the dispatch entry point: ordinal 0, with a pipeline layout of
// 4 constants and 3 storage-buffer bindings. The region takes the device and returns
// the three workgroup counts produced by flow.dispatch.workgroup_count_from_slice.
| hal.executable.export public @main$async_dispatch_175_elementwise_2x1024x5120_f16xf16xf16xf32xi8 ordinal(0) layout(#hal.pipeline.layout<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) { | |
| ^bb0(%device: !hal.device): | |
| %count_x, %count_y, %count_z = flow.dispatch.workgroup_count_from_slice | |
| hal.return %count_x, %count_y, %count_z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @main$async_dispatch_175_elementwise_2x1024x5120_f16xf16xf16xf32xi8() { | |
// f16 scalars consumed by the linalg.generic body further down:
// %cst / %cst_0 are the upper / lower clamp bounds (127 / -128); %cst_1 = 0.5,
// %cst_2 = 1.0 and %cst_3 = 2.0 feed the erf-based term.
| %cst = arith.constant 1.270000e+02 : f16 | |
| %cst_0 = arith.constant -1.280000e+02 : f16 | |
| %cst_1 = arith.constant 5.000000e-01 : f16 | |
| %cst_2 = arith.constant 1.000000e+00 : f16 | |
| %cst_3 = arith.constant 2.000000e+00 : f16 | |
// Load the four i32 constants of the pipeline layout (ordinals 0..3).
| %0 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
| %1 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
| %2 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
| %3 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
// Unsigned (zero-extending) casts to index; these values are later used as binding offsets
// via the util.assume.int results.
| %4 = arith.index_castui %0 : i32 to index | |
| %5 = arith.index_castui %1 : i32 to index | |
| %6 = arith.index_castui %2 : i32 to index | |
| %7 = arith.index_castui %3 : i32 to index | |
// Attaches integer-range assumptions to the four offsets %4..%7 and forwards them as
// %8#0..%8#3 (used as the offset(...) operands of the binding subspans below).
// Each operand carries a list of <umin, umax, udiv> entries; every entry listed here has
// umin == umax == udiv, i.e. it names one exact value.
// NOTE(review): presumably one entry per call site of this dispatch — confirm against the
// IREE util dialect documentation.
| %8:4 = util.assume.int | |
| %4[<umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 152930112, umax = 152930112, udiv = 152930112>, <umin = 152930112, umax = 152930112, udiv = 152930112>, <umin = 152930112, umax = 152930112, udiv = 152930112>, <umin = 152930112, umax = 152930112, udiv = 152930112>, <umin = 152930112, umax = 152930112, udiv = 152930112>, <umin = 152930112, umax = 152930112, udiv = 152930112>, <umin = 152930112, umax = 152930112, udiv = 152930112>, <umin = 152930112, umax = 152930112, udiv = 152930112>, <umin = 152930112, umax = 152930112, udiv = 152930112>, <umin = 152930112, umax = 152930112, udiv = 152930112>, <umin = 156206912, umax = 156206912, udiv = 156206912>, <umin = 158172992, umax = 158172992, udiv = 158172992>, <umin = 158172992, umax = 158172992, udiv = 158172992>, <umin = 158172992, umax = 158172992, udiv = 158172992>, <umin = 158172992, umax = 158172992, udiv = 158172992>, <umin = 158172992, umax = 158172992, udiv = 158172992>, <umin = 158172992, umax = 158172992, udiv = 158172992>, <umin = 158172992, umax = 158172992, udiv = 158172992>, <umin = 158172992, umax = 158172992, udiv = 158172992>, <umin = 158172992, umax = 158172992, udiv = 158172992>, <umin = 152930112, umax = 152930112, udiv = 152930112>, <umin = 152930112, umax = 152930112, udiv = 152930112>, <umin = 152930112, umax = 152930112, udiv = 152930112>, <umin = 152930112, umax = 152930112, udiv = 152930112>, <umin = 152930112, umax = 152930112, udiv = 152930112>, <umin = 152930112, umax = 
152930112, udiv = 152930112>, <umin = 152930112, umax = 152930112, udiv = 152930112>, <umin = 152930112, umax = 152930112, udiv = 152930112>, <umin = 152930112, umax = 152930112, udiv = 152930112>, <umin = 152930112, umax = 152930112, udiv = 152930112>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>, <umin = 147687232, umax = 147687232, udiv = 147687232>], | |
| %5[<umin = 139413888, umax = 139413888, udiv = 139413888>, <umin = 174365376, umax = 174365376, udiv = 174365376>, <umin = 209316864, umax = 209316864, udiv = 209316864>, <umin = 244268352, umax = 244268352, udiv = 244268352>, <umin = 279219840, umax = 279219840, udiv = 279219840>, <umin = 314171328, umax = 314171328, udiv = 314171328>, <umin = 349122816, umax = 349122816, udiv = 349122816>, <umin = 384074304, umax = 384074304, udiv = 384074304>, <umin = 419025792, umax = 419025792, udiv = 419025792>, <umin = 453977280, umax = 453977280, udiv = 453977280>, <umin = 525045504, umax = 525045504, udiv = 525045504>, <umin = 559996992, umax = 559996992, udiv = 559996992>, <umin = 594948480, umax = 594948480, udiv = 594948480>, <umin = 629899968, umax = 629899968, udiv = 629899968>, <umin = 664851456, umax = 664851456, udiv = 664851456>, <umin = 699802944, umax = 699802944, udiv = 699802944>, <umin = 734754432, umax = 734754432, udiv = 734754432>, <umin = 769705920, umax = 769705920, udiv = 769705920>, <umin = 804657408, umax = 804657408, udiv = 804657408>, <umin = 839608896, umax = 839608896, udiv = 839608896>, <umin = 910677120, umax = 910677120, udiv = 910677120>, <umin = 945628608, umax = 945628608, udiv = 945628608>, <umin = 980580096, umax = 980580096, udiv = 980580096>, <umin = 1015531584, umax = 1015531584, udiv = 1015531584>, <umin = 1050483072, umax = 1050483072, udiv = 1050483072>, <umin = 1085434560, umax = 1085434560, udiv = 1085434560>, <umin = 1120386048, umax = 1120386048, udiv = 1120386048>, <umin = 1155337536, umax = 1155337536, udiv = 1155337536>, <umin = 1190289024, umax = 1190289024, udiv = 1190289024>, <umin = 1225240512, umax = 1225240512, udiv = 1225240512>, <umin = 1347163328, umax = 1347163328, udiv = 1347163328>, <umin = 1382114816, umax = 1382114816, udiv = 1382114816>, <umin = 1417066304, umax = 1417066304, udiv = 1417066304>, <umin = 1452017792, umax = 1452017792, udiv = 1452017792>, <umin = 1486969280, umax = 1486969280, udiv = 
1486969280>, <umin = 1521920768, umax = 1521920768, udiv = 1521920768>, <umin = 1556872256, umax = 1556872256, udiv = 1556872256>, <umin = 1591823744, umax = 1591823744, udiv = 1591823744>, <umin = 1626775232, umax = 1626775232, udiv = 1626775232>, <umin = 1661726720, umax = 1661726720, udiv = 1661726720>, <umin = 1750840448, umax = 1750840448, udiv = 1750840448>, <umin = 1785791936, umax = 1785791936, udiv = 1785791936>, <umin = 1820743424, umax = 1820743424, udiv = 1820743424>, <umin = 1855694912, umax = 1855694912, udiv = 1855694912>, <umin = 1890646400, umax = 1890646400, udiv = 1890646400>, <umin = 1925597888, umax = 1925597888, udiv = 1925597888>, <umin = 1960549376, umax = 1960549376, udiv = 1960549376>, <umin = 1995500864, umax = 1995500864, udiv = 1995500864>, <umin = 2030452352, umax = 2030452352, udiv = 2030452352>, <umin = 2065403840, umax = 2065403840, udiv = 2065403840>, <umin = 2146320448, umax = 2146320448, udiv = 2146320448>, <umin = 2181271936, umax = 2181271936, udiv = 2181271936>, <umin = 2216223424, umax = 2216223424, udiv = 2216223424>, <umin = 2251174912, umax = 2251174912, udiv = 2251174912>, <umin = 2286126400, umax = 2286126400, udiv = 2286126400>, <umin = 2321077888, umax = 2321077888, udiv = 2321077888>, <umin = 2356029376, umax = 2356029376, udiv = 2356029376>, <umin = 2390980864, umax = 2390980864, udiv = 2390980864>, <umin = 2425932352, umax = 2425932352, udiv = 2425932352>, <umin = 2460883840, umax = 2460883840, udiv = 2460883840>], | |
| %6[<umin = 139424128, umax = 139424128, udiv = 139424128>, <umin = 174375616, umax = 174375616, udiv = 174375616>, <umin = 209327104, umax = 209327104, udiv = 209327104>, <umin = 244278592, umax = 244278592, udiv = 244278592>, <umin = 279230080, umax = 279230080, udiv = 279230080>, <umin = 314181568, umax = 314181568, udiv = 314181568>, <umin = 349133056, umax = 349133056, udiv = 349133056>, <umin = 384084544, umax = 384084544, udiv = 384084544>, <umin = 419036032, umax = 419036032, udiv = 419036032>, <umin = 453987520, umax = 453987520, udiv = 453987520>, <umin = 525055744, umax = 525055744, udiv = 525055744>, <umin = 560007232, umax = 560007232, udiv = 560007232>, <umin = 594958720, umax = 594958720, udiv = 594958720>, <umin = 629910208, umax = 629910208, udiv = 629910208>, <umin = 664861696, umax = 664861696, udiv = 664861696>, <umin = 699813184, umax = 699813184, udiv = 699813184>, <umin = 734764672, umax = 734764672, udiv = 734764672>, <umin = 769716160, umax = 769716160, udiv = 769716160>, <umin = 804667648, umax = 804667648, udiv = 804667648>, <umin = 839619136, umax = 839619136, udiv = 839619136>, <umin = 910687360, umax = 910687360, udiv = 910687360>, <umin = 945638848, umax = 945638848, udiv = 945638848>, <umin = 980590336, umax = 980590336, udiv = 980590336>, <umin = 1015541824, umax = 1015541824, udiv = 1015541824>, <umin = 1050493312, umax = 1050493312, udiv = 1050493312>, <umin = 1085444800, umax = 1085444800, udiv = 1085444800>, <umin = 1120396288, umax = 1120396288, udiv = 1120396288>, <umin = 1155347776, umax = 1155347776, udiv = 1155347776>, <umin = 1190299264, umax = 1190299264, udiv = 1190299264>, <umin = 1225250752, umax = 1225250752, udiv = 1225250752>, <umin = 1347173568, umax = 1347173568, udiv = 1347173568>, <umin = 1382125056, umax = 1382125056, udiv = 1382125056>, <umin = 1417076544, umax = 1417076544, udiv = 1417076544>, <umin = 1452028032, umax = 1452028032, udiv = 1452028032>, <umin = 1486979520, umax = 1486979520, udiv = 
1486979520>, <umin = 1521931008, umax = 1521931008, udiv = 1521931008>, <umin = 1556882496, umax = 1556882496, udiv = 1556882496>, <umin = 1591833984, umax = 1591833984, udiv = 1591833984>, <umin = 1626785472, umax = 1626785472, udiv = 1626785472>, <umin = 1661736960, umax = 1661736960, udiv = 1661736960>, <umin = 1750850688, umax = 1750850688, udiv = 1750850688>, <umin = 1785802176, umax = 1785802176, udiv = 1785802176>, <umin = 1820753664, umax = 1820753664, udiv = 1820753664>, <umin = 1855705152, umax = 1855705152, udiv = 1855705152>, <umin = 1890656640, umax = 1890656640, udiv = 1890656640>, <umin = 1925608128, umax = 1925608128, udiv = 1925608128>, <umin = 1960559616, umax = 1960559616, udiv = 1960559616>, <umin = 1995511104, umax = 1995511104, udiv = 1995511104>, <umin = 2030462592, umax = 2030462592, udiv = 2030462592>, <umin = 2065414080, umax = 2065414080, udiv = 2065414080>, <umin = 2146330688, umax = 2146330688, udiv = 2146330688>, <umin = 2181282176, umax = 2181282176, udiv = 2181282176>, <umin = 2216233664, umax = 2216233664, udiv = 2216233664>, <umin = 2251185152, umax = 2251185152, udiv = 2251185152>, <umin = 2286136640, umax = 2286136640, udiv = 2286136640>, <umin = 2321088128, umax = 2321088128, udiv = 2321088128>, <umin = 2356039616, umax = 2356039616, udiv = 2356039616>, <umin = 2390991104, umax = 2390991104, udiv = 2390991104>, <umin = 2425942592, umax = 2425942592, udiv = 2425942592>, <umin = 2460894080, umax = 2460894080, udiv = 2460894080>], | |
| %7[<umin = 95258432, umax = 95258432, udiv = 95258432>, <umin = 121472832, umax = 121472832, udiv = 121472832>, <umin = 95258432, umax = 95258432, udiv = 95258432>, <umin = 121472832, umax = 121472832, udiv = 121472832>, <umin = 95258432, umax = 95258432, udiv = 95258432>, <umin = 121472832, umax = 121472832, udiv = 121472832>, <umin = 95258432, umax = 95258432, udiv = 95258432>, <umin = 121472832, umax = 121472832, udiv = 121472832>, <umin = 95258432, umax = 95258432, udiv = 95258432>, <umin = 121472832, umax = 121472832, udiv = 121472832>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 129992512, umax = 129992512, udiv = 129992512>, <umin = 131958592, umax = 131958592, udiv = 131958592>, <umin = 131958592, umax = 131958592, udiv = 131958592>, <umin = 131958592, umax = 131958592, udiv = 131958592>, <umin = 131958592, umax = 131958592, udiv = 131958592>, <umin = 131958592, umax = 131958592, udiv = 131958592>, <umin = 131958592, umax = 131958592, udiv = 131958592>, <umin = 131958592, umax = 131958592, udiv = 131958592>, <umin = 131958592, umax = 131958592, udiv = 131958592>, <umin = 131958592, umax = 131958592, udiv = 131958592>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 100501312, umax = 100501312, udiv = 
100501312>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 95258432, umax = 95258432, udiv = 95258432>, <umin = 121472832, umax = 121472832, udiv = 121472832>, <umin = 95258432, umax = 95258432, udiv = 95258432>, <umin = 121472832, umax = 121472832, udiv = 121472832>, <umin = 95258432, umax = 95258432, udiv = 95258432>, <umin = 121472832, umax = 121472832, udiv = 121472832>, <umin = 95258432, umax = 95258432, udiv = 95258432>, <umin = 121472832, umax = 121472832, udiv = 121472832>, <umin = 95258432, umax = 95258432, udiv = 95258432>, <umin = 121472832, umax = 121472832, udiv = 121472832>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 42499392, umax = 42499392, udiv = 42499392>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 42499392, umax = 42499392, udiv = 42499392>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 42499392, umax = 42499392, udiv = 42499392>, <umin = 100501312, umax = 100501312, udiv = 100501312>, <umin = 42499392, umax = 42499392, udiv = 42499392>, <umin = 100501312, umax = 100501312, udiv = 100501312>] | |
| : index, index, index, index | |
// Buffer views, each at one of the assumed offsets %8#0..%8#3 with 64-byte alignment:
//   %9  : binding 0, read-only 2x1024x10240 f16 input
//   %10 : binding 1, read-only 5120 f16 vector
//   %11 : binding 1 again (different offset), read-only rank-0 f32 scalar
//   %12 : binding 2, write-only 2x1024x5120 i8 output
| %9 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%8#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x1024x10240xf16>> | |
| %10 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%8#1) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<5120xf16>> | |
| %11 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%8#2) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<f32>> | |
| %12 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%8#3) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x1024x5120xi8>> | |
// Materialize the whole 5120-element vector and the scalar as tensors.
| %13 = flow.dispatch.tensor.load %10, offsets = [0], sizes = [5120], strides = [1] : !flow.dispatch.tensor<readonly:tensor<5120xf16>> -> tensor<5120xf16> | |
| %14 = flow.dispatch.tensor.load %11, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32> | |
// Destination tensor passed as the outs operand of the linalg.generic.
| %15 = tensor.empty() : tensor<2x1024x5120xi8> | |
// The 10240-wide last dimension of %9 is read as two unit-stride halves:
// %16 covers indices 5120..10239 and %17 covers indices 0..5119.
| %16 = flow.dispatch.tensor.load %9, offsets = [0, 0, 5120], sizes = [2, 1024, 5120], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1024x10240xf16>> -> tensor<2x1024x5120xf16> | |
| %17 = flow.dispatch.tensor.load %9, offsets = [0, 0, 0], sizes = [2, 1024, 5120], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1024x10240xf16>> -> tensor<2x1024x5120xf16> | |
// Elementwise kernel over a 2x1024x5120 iteration space (all three dims parallel).
// Operands: a = element of %17, b = element of %16, w = %13[d2] (broadcast over d0, d1),
// s = the rank-0 f32 value %14 (broadcast everywhere). Per element it computes
//   q = fptosi(clamp(roundeven(a * (b * 0.5 * (1 + erf(b / sqrt(2)))) * w / truncf(s)), -128, 127))
// with all arithmetic after the truncf carried out in f16.
| %18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17, %16, %13, %14 : tensor<2x1024x5120xf16>, tensor<2x1024x5120xf16>, tensor<5120xf16>, tensor<f32>) outs(%15 : tensor<2x1024x5120xi8>) { | |
| ^bb0(%a: f16, %b: f16, %w: f16, %s_f32: f32, %out: i8): | |
// Erf-based term applied to b: b * 0.5 * (1 + erf(b / sqrt(2))).
| %sqrt2 = math.sqrt %cst_3 : f16 | |
| %b_scaled = arith.divf %b, %sqrt2 : f16 | |
| %erf = math.erf %b_scaled : f16 | |
| %erf_p1 = arith.addf %erf, %cst_2 : f16 | |
| %half_erf_p1 = arith.mulf %erf_p1, %cst_1 : f16 | |
| %gated = arith.mulf %b, %half_erf_p1 : f16 | |
// Combine with a and w, then divide by the scalar after truncating it from f32 to f16.
| %ab = arith.mulf %a, %gated : f16 | |
| %abw = arith.mulf %ab, %w : f16 | |
| %s = arith.truncf %s_f32 : f32 to f16 | |
| %ratio = arith.divf %abw, %s : f16 | |
// Round half-to-even, clamp to [-128, 127], convert to signed i8.
// ult/ugt are unordered predicates, so a NaN in %rounded makes the first compare true
// and the value becomes -128 before the conversion.
| %rounded = math.roundeven %ratio : f16 | |
| %lt_min = arith.cmpf ult, %rounded, %cst_0 : f16 | |
| %lo_clamped = arith.select %lt_min, %cst_0, %rounded : f16 | |
| %gt_max = arith.cmpf ugt, %lo_clamped, %cst : f16 | |
| %clamped = arith.select %gt_max, %cst, %lo_clamped : f16 | |
| %q = arith.fptosi %clamped : f16 to i8 | |
| linalg.yield %q : i8 | |
| } -> tensor<2x1024x5120xi8> | |
// Write the complete 2x1024x5120 i8 result (%18) to the write-only output view %12, then return.
| flow.dispatch.tensor.store %18, %12, offsets = [0, 0, 0], sizes = [2, 1024, 5120], strides = [1, 1, 1] : tensor<2x1024x5120xi8> -> !flow.dispatch.tensor<writeonly:tensor<2x1024x5120xi8>> | |
| return | |
| } | |
| } | |
| } | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| hal.executable public @main$async_dispatch_58 { | |
| hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>) { | |
// Export record for the dispatch entry point: ordinal 0, with a pipeline layout of
// 4 constants and 3 storage-buffer bindings. The region takes the device and returns
// the three workgroup counts produced by flow.dispatch.workgroup_count_from_slice.
| hal.executable.export public @main$async_dispatch_58_elementwise_2x4096x2560_f16xf16xf16xf32xi8 ordinal(0) layout(#hal.pipeline.layout<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) { | |
| ^bb0(%device: !hal.device): | |
| %count_x, %count_y, %count_z = flow.dispatch.workgroup_count_from_slice | |
| hal.return %count_x, %count_y, %count_z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @main$async_dispatch_58_elementwise_2x4096x2560_f16xf16xf16xf32xi8() { | |
// f16 scalars consumed by the linalg.generic body further down:
// %cst / %cst_0 are the upper / lower clamp bounds (127 / -128); %cst_1 = 0.5,
// %cst_2 = 1.0 and %cst_3 = 2.0 feed the erf-based term.
| %cst = arith.constant 1.270000e+02 : f16 | |
| %cst_0 = arith.constant -1.280000e+02 : f16 | |
| %cst_1 = arith.constant 5.000000e-01 : f16 | |
| %cst_2 = arith.constant 1.000000e+00 : f16 | |
| %cst_3 = arith.constant 2.000000e+00 : f16 | |
// Load the four i32 constants of the pipeline layout (ordinals 0..3).
| %0 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
| %1 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
| %2 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
| %3 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
// Unsigned (zero-extending) casts to index; these values are later used as binding offsets
// via the util.assume.int results.
| %4 = arith.index_castui %0 : i32 to index | |
| %5 = arith.index_castui %1 : i32 to index | |
| %6 = arith.index_castui %2 : i32 to index | |
| %7 = arith.index_castui %3 : i32 to index | |
// Attaches integer-range assumptions to the four offsets %4..%7 and forwards them as
// %8#0..%8#3 (used as the offset(...) operands of the binding subspans below).
// Each operand carries ten <umin, umax, udiv> entries; every entry listed here has
// umin == umax == udiv, i.e. it names one exact value.
// NOTE(review): presumably one entry per call site of this dispatch — confirm against the
// IREE util dialect documentation.
| %8:4 = util.assume.int | |
| %4[<umin = 96069632, umax = 96069632, udiv = 96069632>, <umin = 106555392, umax = 106555392, udiv = 106555392>, <umin = 106555392, umax = 106555392, udiv = 106555392>, <umin = 117041152, umax = 117041152, udiv = 117041152>, <umin = 106555392, umax = 106555392, udiv = 106555392>, <umin = 117041152, umax = 117041152, udiv = 117041152>, <umin = 96069632, umax = 96069632, udiv = 96069632>, <umin = 106555392, umax = 106555392, udiv = 106555392>, <umin = 90826752, umax = 90826752, udiv = 90826752>, <umin = 101312512, umax = 101312512, udiv = 101312512>], | |
| %5[<umin = 37175296, umax = 37175296, udiv = 37175296>, <umin = 47280704, umax = 47280704, udiv = 47280704>, <umin = 67253888, umax = 67253888, udiv = 67253888>, <umin = 77359296, umax = 77359296, udiv = 77359296>, <umin = 2510394432, umax = 2510394432, udiv = 2510394432>, <umin = 2520499840, umax = 2520499840, udiv = 2520499840>, <umin = 2544990208, umax = 2544990208, udiv = 2544990208>, <umin = 2555095616, umax = 2555095616, udiv = 2555095616>, <umin = 2577535424, umax = 2577535424, udiv = 2577535424>, <umin = 2587640832, umax = 2587640832, udiv = 2587640832>], | |
| %6[<umin = 37180416, umax = 37180416, udiv = 37180416>, <umin = 47285824, umax = 47285824, udiv = 47285824>, <umin = 67259008, umax = 67259008, udiv = 67259008>, <umin = 77364416, umax = 77364416, udiv = 77364416>, <umin = 2510399552, umax = 2510399552, udiv = 2510399552>, <umin = 2520504960, umax = 2520504960, udiv = 2520504960>, <umin = 2544995328, umax = 2544995328, udiv = 2544995328>, <umin = 2555100736, umax = 2555100736, udiv = 2555100736>, <umin = 2577540544, umax = 2577540544, udiv = 2577540544>, <umin = 2587645952, umax = 2587645952, udiv = 2587645952>], | |
| %7[<umin = 179955712, umax = 179955712, udiv = 179955712>, <umin = 190441472, umax = 190441472, udiv = 190441472>, <umin = 190441472, umax = 190441472, udiv = 190441472>, <umin = 200927232, umax = 200927232, udiv = 200927232>, <umin = 190441472, umax = 190441472, udiv = 190441472>, <umin = 200927232, umax = 200927232, udiv = 200927232>, <umin = 179955712, umax = 179955712, udiv = 179955712>, <umin = 190441472, umax = 190441472, udiv = 190441472>, <umin = 174712832, umax = 174712832, udiv = 174712832>, <umin = 185198592, umax = 185198592, udiv = 185198592>] | |
| : index, index, index, index | |
// Buffer views, each at one of the assumed offsets %8#0..%8#3 with 64-byte alignment:
//   %9  : binding 0, read-only 2x4096x5120 f16 input
//   %10 : binding 1, read-only 2560 f16 vector
//   %11 : binding 1 again (different offset), read-only rank-0 f32 scalar
//   %12 : binding 2, write-only 2x4096x2560 i8 output
| %9 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%8#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x4096x5120xf16>> | |
| %10 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%8#1) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2560xf16>> | |
| %11 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%8#2) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<f32>> | |
| %12 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%8#3) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x2560xi8>> | |
// Materialize the whole 2560-element vector and the scalar as tensors.
| %13 = flow.dispatch.tensor.load %10, offsets = [0], sizes = [2560], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2560xf16>> -> tensor<2560xf16> | |
| %14 = flow.dispatch.tensor.load %11, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32> | |
// Destination tensor passed as the outs operand of the linalg.generic.
| %15 = tensor.empty() : tensor<2x4096x2560xi8> | |
// The 5120-wide last dimension of %9 is read as two unit-stride halves:
// %16 covers indices 2560..5119 and %17 covers indices 0..2559.
| %16 = flow.dispatch.tensor.load %9, offsets = [0, 0, 2560], sizes = [2, 4096, 2560], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x4096x5120xf16>> -> tensor<2x4096x2560xf16> | |
| %17 = flow.dispatch.tensor.load %9, offsets = [0, 0, 0], sizes = [2, 4096, 2560], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x4096x5120xf16>> -> tensor<2x4096x2560xf16> | |
// Elementwise kernel over a 2x4096x2560 iteration space (all three dims parallel).
// Operands: a = element of %17, b = element of %16, w = %13[d2] (broadcast over d0, d1),
// s = the rank-0 f32 value %14 (broadcast everywhere). Per element it computes
//   q = fptosi(clamp(roundeven(a * (b * 0.5 * (1 + erf(b / sqrt(2)))) * w / truncf(s)), -128, 127))
// with all arithmetic after the truncf carried out in f16.
| %18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17, %16, %13, %14 : tensor<2x4096x2560xf16>, tensor<2x4096x2560xf16>, tensor<2560xf16>, tensor<f32>) outs(%15 : tensor<2x4096x2560xi8>) { | |
| ^bb0(%a: f16, %b: f16, %w: f16, %s_f32: f32, %out: i8): | |
// Erf-based term applied to b: b * 0.5 * (1 + erf(b / sqrt(2))).
| %sqrt2 = math.sqrt %cst_3 : f16 | |
| %b_scaled = arith.divf %b, %sqrt2 : f16 | |
| %erf = math.erf %b_scaled : f16 | |
| %erf_p1 = arith.addf %erf, %cst_2 : f16 | |
| %half_erf_p1 = arith.mulf %erf_p1, %cst_1 : f16 | |
| %gated = arith.mulf %b, %half_erf_p1 : f16 | |
// Combine with a and w, then divide by the scalar after truncating it from f32 to f16.
| %ab = arith.mulf %a, %gated : f16 | |
| %abw = arith.mulf %ab, %w : f16 | |
| %s = arith.truncf %s_f32 : f32 to f16 | |
| %ratio = arith.divf %abw, %s : f16 | |
// Round half-to-even, clamp to [-128, 127], convert to signed i8.
// ult/ugt are unordered predicates, so a NaN in %rounded makes the first compare true
// and the value becomes -128 before the conversion.
| %rounded = math.roundeven %ratio : f16 | |
| %lt_min = arith.cmpf ult, %rounded, %cst_0 : f16 | |
| %lo_clamped = arith.select %lt_min, %cst_0, %rounded : f16 | |
| %gt_max = arith.cmpf ugt, %lo_clamped, %cst : f16 | |
| %clamped = arith.select %gt_max, %cst, %lo_clamped : f16 | |
| %q = arith.fptosi %clamped : f16 to i8 | |
| linalg.yield %q : i8 | |
| } -> tensor<2x4096x2560xi8> | |
// Write the complete 2x4096x2560 i8 result (%18) to the write-only output view %12, then return.
| flow.dispatch.tensor.store %18, %12, offsets = [0, 0, 0], sizes = [2, 4096, 2560], strides = [1, 1, 1] : tensor<2x4096x2560xi8> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x2560xi8>> | |
| return | |
| } | |
| } | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment