// -----// IR Dump After ConvertTorchOnnxToTorch (convert-torch-onnx-to-torch) //----- //
func.func @torch_jit(%arg0: !torch.vtensor<[1,128,4,256],f32>) -> !torch.vtensor<[1,257,4,256],f32> attributes {torch.onnx_meta.ir_version = 7 : si64, torch.onnx_meta.opset_version = 21 : si64, torch.onnx_meta.producer_name = "pytorch", torch.onnx_meta.producer_version = "1.12.1"} {
%0 = torch.vtensor.literal(dense_resource<__onnx_constant_not_found_possibly_due_to_being_elided__> : tensor<257x128x1x1xf32>) : !torch.vtensor<[257,128,1,1],f32>
%1 = torch.vtensor.literal(dense_resource<__onnx_constant_not_found_possibly_due_to_being_elided___1> : tensor<257xf32>) : !torch.vtensor<[257],f32>
%int0 = torch.constant.int 0
%int0_0 = torch.constant.int 0
%2 = torch.prim.ListConstruct %int0, %int0_0 : (!torch.int, !torch.int) -> !torch.list<int>
%int1 = torch.constant.int 1
%int1_1 = torch.constant.int 1
%int1_2 = torch.constant.int 1
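The dump cuts off here, but what is visible is consistent with a 1x1 convolution: the elided 257x128x1x1 weight and 257-element bias map the [1,128,4,256] input to the [1,257,4,256] result in the signature (128 channels in, 257 out, spatial 4x256 unchanged), and the integer constants being assembled look like the conv's padding, stride, and dilation lists.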
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
module {
func.func @unaligned_k(%arg0: tensor<128x258xf32>, %arg1: tensor<258x256xf32>) -> tensor<128x256xf32> {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = tensor.empty() : tensor<128x256xf32>
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<128x256xf32>) -> tensor<128x256xf32>
%2 = linalg.matmul ins(%arg0, %arg1 : tensor<128x258xf32>, tensor<258x256xf32>) outs(%1 : tensor<128x256xf32>) -> tensor<128x256xf32>
return %2 : tensor<128x256xf32>
}
}
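(Note that linalg.matmul accumulates into its outs operand, so it consumes the zero-filled %1 produced by linalg.fill; accumulating into the raw tensor.empty result would mix in uninitialized values.)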
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-am
hal.executable public @prefill_bs4$async_dispatch_122 {
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
hal.executable.export public @prefill_bs4$async_dispatch_122_transpose_4x4xDx128_f16 ordinal(0) layout(#hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
^bb0(%arg0: !hal.device, %arg1: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @prefill_bs4$async_dispatch_122_transpose_4x4xDx128_f16() {
%c32_i64 = arith.constant 32 : i64
failed to translate executables
prefill_8b_tp8.mlir:9903:13: error: One or more operations with large vector sizes (8192 bytes) were found:
%3425 = torch.aten.transpose.int %3417#0, %int1_1244, %int2_1245 : !torch.vtensor<[4,4,?,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[4,?,4,128],f16>
^
prefill_8b_tp8.mlir:9857:15: note: %67 = vector.transfer_read %extracted_slice_8[%c0, %c0, %c0, %c0, %c0, %c0], %cst_3, %66 {in_bounds = [true, true, true, true, true, true]} : tensor<4x1x?x1x1x128xf16>, vector<4x1x8x1x1x128xf16>
%3417:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%3393, %3401, %3409, %float0.000000e00, %true_1213, %none_1214, %none_1215) : (!torch.vtensor<[4,4,?,128],f16>, !torch.vtensor<[4,4,?,128],f16>, !torch.vtensor<[4,4,?,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[4,4,?,128],f16>, !torch.vtensor<[4,4,?],f32>)
^
prefill_8b_tp8.mlir:9857:15: note: %69 = arith.extf %67 : vector<4x1x8x1
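A quick size check on that diagnostic: vector<4x1x8x1x1x128xf16> holds 4 * 1 * 8 * 1 * 1 * 128 = 4096 f16 lanes, and at 2 bytes per lane that is exactly the 8192 bytes the pass reports. The dynamic ? dimension of the tensor<4x1x?x1x1x128xf16> source has been vectorized at size 8, which is how the transpose feeding the attention op ends up producing an oversized vector.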
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
module {
func.func @matmul_2048x512x1024_f32_f32() -> tensor<2048x512xf32> {
%0 = util.unfoldable_constant dense<1.000000e+00> : tensor<2048x1024xf32>
%1 = util.unfoldable_constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst = arith.constant 0.000000e+00 : f32
%2 = tensor.empty() : tensor<2048x512xf32>
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
return %4 : tensor<2048x512xf32>
}
}
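In this dump util.unfoldable_constant keeps the operands opaque to the constant folder, so the 2048x512x1024 matmul is actually computed at runtime instead of being folded away; this is the usual shape of IREE's matmul correctness tests.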
==2213870==ERROR: AddressSanitizer: heap-use-after-free on address 0x50e000021d2c at pc 0x7b499bfdc2d7 bp 0x7fff58a95a70 sp 0x7fff58a95a68
READ of size 4 at 0x50e000021d2c thread T0
#0 0x7b499bfdc2d6 in mlir::Operation::getRegions() /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/Operation.h:674:9
#1 0x7b499bfdc2d6 in mlir::ForwardIterator::makeIterable(mlir::Operation&) /home/nod/iree/third_party/llvm-project/mlir/lib/IR/Visitors.cpp:18:16
#2 0x7b499bd783f1 in void mlir::detail::walk<mlir::ForwardIterator>(mlir::Operation*, llvm::function_ref<void (mlir::Operation*)>, mlir::WalkOrder) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/Visitors.h:176:23
#3 0x7b49a160111e in std::enable_if<!llvm::is_one_of<mlir::gpu::ThreadIdOp, mlir::Operation*, mlir::Region*, mlir::Block*>::value && std::is_same<void, void>::value, void>::type mlir::detail::walk<(mlir::WalkOrder)1, mlir::ForwardIterator, void replaceUnitMappingIdsHelper<mlir::gpu::ThreadIdOp, mlir::Operation>(mlir::Rewr
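The stack shows mlir::ForwardIterator::makeIterable calling Operation::getRegions on freed memory while a walk is still in flight, underneath a helper that rewrites gpu.thread_id ops. The usual culprit for this class of ASan report is erasing an operation from inside the walk that is visiting it. A minimal sketch of the collect-then-mutate pattern that avoids it (hypothetical pass code, not the actual IREE fix):

#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/PatternMatch.h"
#include "llvm/ADT/SmallVector.h"

// Unsafe: root->walk([&](mlir::gpu::ThreadIdOp op) { rewriter.eraseOp(op); });
// The walk's iterator may still touch `op` after it has been freed.
static void replaceThreadIdsSafely(mlir::Operation *root,
                                   mlir::RewriterBase &rewriter) {
  // Phase 1: traverse without mutating, collecting the ops to rewrite.
  llvm::SmallVector<mlir::gpu::ThreadIdOp> targets;
  root->walk([&](mlir::gpu::ThreadIdOp op) { targets.push_back(op); });
  // Phase 2: mutate only after the traversal has fully finished.
  for (mlir::gpu::ThreadIdOp op : targets) {
    rewriter.setInsertionPoint(op);
    // ... build a replacement value here, then:
    // rewriter.replaceOp(op, replacement);
  }
}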
// -----// IR Dump After GPUGeneralizeNamedOpsPass (iree-codegen-gpu-generalize-named-ops) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.di
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
#translation = #iree_codegen.translation_info<CPUDefault>
module {
func.func @quantized_matmul_neither_zp_0_dynamic(%arg0: tensor<256x256xi8>, %arg1: tensor<256x256xi8>, %arg2: i32, %arg3: i32, %arg4: tensor<256x256xi32>) -> tensor<256x256xi32> attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} {
%0 = linalg.quantized_matmul ins(%arg0, %arg1, %arg2, %arg3 : tensor<256x256xi8>, tensor<256x256xi8>, i32, i32) outs(%arg4 : tensor<256x256xi32>) -> tensor<256x256xi32>
return %0 : tensor<256x256xi32>
}
}
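For reference, linalg.quantized_matmul subtracts the two scalar zero points from the i8 operands before multiplying and accumulates into the i32 output, which is why %arg2 and %arg3 ride along next to the two tensors. A plain C++ sketch of those semantics (the helper name and row-major layout are my own, not an IREE API):

#include <cstdint>
#include <vector>

// acc[m][n] += (lhs[m][k] - zpLhs) * (rhs[k][n] - zpRhs), summed over k.
void quantizedMatmulRef(const std::vector<int8_t> &lhs,  // M x K, row-major
                        const std::vector<int8_t> &rhs,  // K x N, row-major
                        int32_t zpLhs, int32_t zpRhs,
                        std::vector<int32_t> &acc,       // M x N, pre-initialized
                        int M, int K, int N) {
  for (int m = 0; m < M; ++m)
    for (int n = 0; n < N; ++n)
      for (int k = 0; k < K; ++k)
        acc[m * N + n] += (int32_t(lhs[m * K + k]) - zpLhs) *
                          (int32_t(rhs[k * N + n]) - zpRhs);
}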