Created April 17, 2024 17:10
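The four dispatches below are the IREE LLVM-CPU (znver4) lowering of turbine_llm_mmtfp_3d_8640_3200_f32f16: pack the f32 LHS, broadcast and pack the f16 RHS, run the batched mmt4d matmul, and unpack the f32 result.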
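// dispatch_0: pack the dynamic ?x?x3200 f32 LHS into the ?x?x3200x16x1
// data-tiled layout expected by batch_mmt4d, zero-padding the tiled
// dimension up to a multiple of 16.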
hal.executable public @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0 {
  hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,-avx512pf,+sse4.2,-tsxldtrk,-ptwrite,-widekl,-sm3,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-avx512er,-ccmp,-amx-int8,-kl,-avx10.1-256,-sha512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,+gfni,-avxvnniint16,-amx-fp16,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-prefetchwt1,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
    hal.executable.export public @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0_pack_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 6, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} {
    ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_0_pack_f32() {
        %c0 = arith.constant 0 : index
        %c32_i64 = arith.constant 32 : i64
        %cst = arith.constant 0.000000e+00 : f32
        %0 = hal.interface.constant.load[0] : i32
        %1 = hal.interface.constant.load[1] : i32
        %2 = hal.interface.constant.load[2] : i32
        %3 = hal.interface.constant.load[3] : i32
        %4 = hal.interface.constant.load[4] : i32
        %5 = hal.interface.constant.load[5] : i32
        // Recombine each (low, high) pair of i32 push constants into an i64,
        // then cast to index to recover the three dynamic workload sizes.
        %6 = arith.extui %0 : i32 to i64
        %7 = arith.extui %1 : i32 to i64
        %8 = arith.shli %7, %c32_i64 : i64
        %9 = arith.ori %6, %8 : i64
        %10 = arith.index_castui %9 : i64 to index
        %11 = arith.extui %2 : i32 to i64
        %12 = arith.extui %3 : i32 to i64
        %13 = arith.shli %12, %c32_i64 : i64
        %14 = arith.ori %11, %13 : i64
        %15 = arith.index_castui %14 : i64 to index
        %16 = arith.extui %4 : i32 to i64
        %17 = arith.extui %5 : i32 to i64
        %18 = arith.shli %17, %c32_i64 : i64
        %19 = arith.ori %16, %18 : i64
        %20 = arith.index_castui %19 : i64 to index
        %21 = flow.dispatch.workload.ordinal %10, 0 : index
        %22 = flow.dispatch.workload.ordinal %15, 1 : index
        %23 = flow.dispatch.workload.ordinal %20, 2 : index
        %24 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?x3200xf32>>{%22, %21}
        %25 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x?x3200x16x1xf32>>{%22, %23}
        %26 = flow.dispatch.tensor.load %24, offsets = [0, 0, 0], sizes = [%22, %21, 3200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x3200xf32>>{%22, %21} -> tensor<?x?x3200xf32>
        %27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%21]
        %28 = tensor.empty(%22, %27) : tensor<?x?x3200x16x1xf32>
        // Tile dim 1 by 16 and dim 2 by 1; %cst zero-pads the last partial tile of dim 1.
        %pack = tensor.pack %26 padding_value(%cst : f32) outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %28 : tensor<?x?x3200xf32> -> tensor<?x?x3200x16x1xf32>
        flow.dispatch.tensor.store %pack, %25, offsets = [0, 0, 0, 0, 0], sizes = [%22, %23, 3200, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x?x3200x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x3200x16x1xf32>>{%22, %23}
        return
      }
    }
  }
}
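// dispatch_1: broadcast the static 8640x3200 f16 RHS across the dynamic
// batch dimension, then pack it into the ?x540x3200x16x1 tiled layout
// (540 = 8640 / 16 outer tiles).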
hal.executable public @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1 {
  hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,-avx512pf,+sse4.2,-tsxldtrk,-ptwrite,-widekl,-sm3,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-avx512er,-ccmp,-amx-int8,-kl,-avx10.1-256,-sha512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,+gfni,-avxvnniint16,-amx-fp16,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-prefetchwt1,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
    hal.executable.export public @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack ordinal(0) layout(#hal.pipeline.layout<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} {
    ^bb0(%arg0: !hal.device, %arg1: index):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack() {
        %c0 = arith.constant 0 : index
        %c32_i64 = arith.constant 32 : i64
        %0 = hal.interface.constant.load[0] : i32
        %1 = hal.interface.constant.load[1] : i32
        %2 = hal.interface.constant.load[2] : i32
        %3 = hal.interface.constant.load[3] : i32
        // Recombine the i32 push-constant pairs: %8 is the output buffer byte
        // offset, %13 the dynamic batch size.
        %4 = arith.extui %0 : i32 to i64
        %5 = arith.extui %1 : i32 to i64
        %6 = arith.shli %5, %c32_i64 : i64
        %7 = arith.ori %4, %6 : i64
        %8 = arith.index_castui %7 : i64 to index
        %9 = arith.extui %2 : i32 to i64
        %10 = arith.extui %3 : i32 to i64
        %11 = arith.shli %10, %c32_i64 : i64
        %12 = arith.ori %9, %11 : i64
        %13 = arith.index_castui %12 : i64 to index
        %14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
        %15 = flow.dispatch.workload.ordinal %13, 0 : index
        %16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
        %17 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [8640, 3200], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<8640x3200xf16>
        %18 = tensor.empty(%15) : tensor<?x540x3200x16x1xf16>
        %19 = tensor.empty(%15) : tensor<?x8640x3200xf16>
        // Broadcast the static 8640x3200 tensor across the dynamic batch dimension.
        %20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17 : tensor<8640x3200xf16>) outs(%19 : tensor<?x8640x3200xf16>) {
        ^bb0(%in: f16, %out: f16):
          linalg.yield %in : f16
        } -> tensor<?x8640x3200xf16>
        %pack = tensor.pack %20 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %18 : tensor<?x8640x3200xf16> -> tensor<?x540x3200x16x1xf16>
        flow.dispatch.tensor.store %pack, %16, offsets = [0, 0, 0, 0, 0], sizes = [%15, 540, 3200, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x540x3200x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
        return
      }
    }
  }
}
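// dispatch_2: the mixed-precision batched matmul itself. linalg.batch_mmt4d
// consumes the two packed operands (f32 LHS, f16 RHS) and accumulates into a
// zero-filled ?x?x540x16x16 f32 tile layout.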
hal.executable public @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2 {
  hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,-avx512pf,+sse4.2,-tsxldtrk,-ptwrite,-widekl,-sm3,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-avx512er,-ccmp,-amx-int8,-kl,-avx10.1-256,-sha512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,+gfni,-avxvnniint16,-amx-fp16,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-prefetchwt1,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
    hal.executable.export public @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 10, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} {
    ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32() {
        %c0 = arith.constant 0 : index
        %c32_i64 = arith.constant 32 : i64
        %cst = arith.constant 0.000000e+00 : f32
        %0 = hal.interface.constant.load[0] : i32
        %1 = hal.interface.constant.load[1] : i32
        %2 = hal.interface.constant.load[2] : i32
        %3 = hal.interface.constant.load[3] : i32
        %4 = hal.interface.constant.load[4] : i32
        %5 = hal.interface.constant.load[5] : i32
        %6 = hal.interface.constant.load[6] : i32
        %7 = hal.interface.constant.load[7] : i32
        %8 = hal.interface.constant.load[8] : i32
        %9 = hal.interface.constant.load[9] : i32
        // Recombine the ten i32 push constants into five i64 values: two
        // buffer byte offsets (%14, %19) and three dynamic sizes (%24, %29, %34).
        %10 = arith.extui %0 : i32 to i64
        %11 = arith.extui %1 : i32 to i64
        %12 = arith.shli %11, %c32_i64 : i64
        %13 = arith.ori %10, %12 : i64
        %14 = arith.index_castui %13 : i64 to index
        %15 = arith.extui %2 : i32 to i64
        %16 = arith.extui %3 : i32 to i64
        %17 = arith.shli %16, %c32_i64 : i64
        %18 = arith.ori %15, %17 : i64
        %19 = arith.index_castui %18 : i64 to index
        %20 = arith.extui %4 : i32 to i64
        %21 = arith.extui %5 : i32 to i64
        %22 = arith.shli %21, %c32_i64 : i64
        %23 = arith.ori %20, %22 : i64
        %24 = arith.index_castui %23 : i64 to index
        %25 = arith.extui %6 : i32 to i64
        %26 = arith.extui %7 : i32 to i64
        %27 = arith.shli %26, %c32_i64 : i64
        %28 = arith.ori %25, %27 : i64
        %29 = arith.index_castui %28 : i64 to index
        %30 = arith.extui %8 : i32 to i64
        %31 = arith.extui %9 : i32 to i64
        %32 = arith.shli %31, %c32_i64 : i64
        %33 = arith.ori %30, %32 : i64
        %34 = arith.index_castui %33 : i64 to index
        %35 = flow.dispatch.workload.ordinal %29, 1 : index
        %36 = flow.dispatch.workload.ordinal %34, 2 : index
        %37 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?x3200x16x1xf32>>{%35, %36}
        %38 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%14) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x540x3200x16x1xf16>>{%35}
        %39 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%19) : !flow.dispatch.tensor<writeonly:tensor<?x?x540x16x16xf32>>{%35, %36}
        %40 = flow.dispatch.workload.ordinal %24, 0 : index
        %41 = flow.dispatch.tensor.load %37, offsets = [0, 0, 0, 0, 0], sizes = [%35, %36, 3200, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x3200x16x1xf32>>{%35, %36} -> tensor<?x?x3200x16x1xf32>
        %42 = flow.dispatch.tensor.load %38, offsets = [0, 0, 0, 0, 0], sizes = [%35, 540, 3200, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x540x3200x16x1xf16>>{%35} -> tensor<?x540x3200x16x1xf16>
        %43 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%40]
        %44 = tensor.empty(%35, %43) : tensor<?x?x540x16x16xf32>
        // Zero-fill the accumulator, then multiply the packed f32 LHS by the
        // packed f16 RHS, accumulating in f32.
        %45 = linalg.fill ins(%cst : f32) outs(%44 : tensor<?x?x540x16x16xf32>) -> tensor<?x?x540x16x16xf32>
        %46 = linalg.batch_mmt4d ins(%41, %42 : tensor<?x?x3200x16x1xf32>, tensor<?x540x3200x16x1xf16>) outs(%45 : tensor<?x?x540x16x16xf32>) -> tensor<?x?x540x16x16xf32>
        flow.dispatch.tensor.store %46, %39, offsets = [0, 0, 0, 0, 0], sizes = [%35, %36, 540, 16, 16], strides = [1, 1, 1, 1, 1] : tensor<?x?x540x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x540x16x16xf32>>{%35, %36}
        return
      }
    }
  }
}
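// dispatch_3: unpack the ?x?x540x16x16 f32 accumulator tiles back into the
// row-major ?x?x8640 f32 result.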
hal.executable public @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3 {
  hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,-avx512pf,+sse4.2,-tsxldtrk,-ptwrite,-widekl,-sm3,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-avx512er,-ccmp,-amx-int8,-kl,-avx10.1-256,-sha512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,+gfni,-avxvnniint16,-amx-fp16,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-prefetchwt1,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
    hal.executable.export public @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3_unpack_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} {
    ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_3_unpack_f32() {
        %c32_i64 = arith.constant 32 : i64
        %c0 = arith.constant 0 : index
        %0 = hal.interface.constant.load[0] : i32
        %1 = hal.interface.constant.load[1] : i32
        %2 = hal.interface.constant.load[2] : i32
        %3 = hal.interface.constant.load[3] : i32
        %4 = hal.interface.constant.load[4] : i32
        %5 = hal.interface.constant.load[5] : i32
        %6 = hal.interface.constant.load[6] : i32
        %7 = hal.interface.constant.load[7] : i32
        // Recombine the i32 push-constant pairs: one input buffer byte offset
        // (%12) and three dynamic sizes (%17, %22, %27).
        %8 = arith.extui %0 : i32 to i64
        %9 = arith.extui %1 : i32 to i64
        %10 = arith.shli %9, %c32_i64 : i64
        %11 = arith.ori %8, %10 : i64
        %12 = arith.index_castui %11 : i64 to index
        %13 = arith.extui %2 : i32 to i64
        %14 = arith.extui %3 : i32 to i64
        %15 = arith.shli %14, %c32_i64 : i64
        %16 = arith.ori %13, %15 : i64
        %17 = arith.index_castui %16 : i64 to index
        %18 = arith.extui %4 : i32 to i64
        %19 = arith.extui %5 : i32 to i64
        %20 = arith.shli %19, %c32_i64 : i64
        %21 = arith.ori %18, %20 : i64
        %22 = arith.index_castui %21 : i64 to index
        %23 = arith.extui %6 : i32 to i64
        %24 = arith.extui %7 : i32 to i64
        %25 = arith.shli %24, %c32_i64 : i64
        %26 = arith.ori %23, %25 : i64
        %27 = arith.index_castui %26 : i64 to index
        %28 = flow.dispatch.workload.ordinal %17, 0 : index
        %29 = flow.dispatch.workload.ordinal %22, 1 : index
        %30 = flow.dispatch.workload.ordinal %27, 2 : index
        %31 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%12) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?x540x16x16xf32>>{%29, %28}
        %32 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x?x8640xf32>>{%29, %30}
        %33 = flow.dispatch.tensor.load %31, offsets = [0, 0, 0, 0, 0], sizes = [%29, %28, 540, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x540x16x16xf32>>{%29, %28} -> tensor<?x?x540x16x16xf32>
        %34 = tensor.empty(%29, %30) : tensor<?x?x8640xf32>
        // Fold the 16x16 tiles back into a row-major ?x?x8640 tensor.
        %unpack = tensor.unpack %33 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %34 : tensor<?x?x540x16x16xf32> -> tensor<?x?x8640xf32>
        flow.dispatch.tensor.store %unpack, %32, offsets = [0, 0, 0], sizes = [%29, %30, 8640], strides = [1, 1, 1] : tensor<?x?x8640xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8640xf32>>{%29, %30}
        return
      }
    }
  }
}