Prashant Kumar (pashu123) - GitHub gists
module {
  func.func @flash_attention_func(%arg0: !torch.vtensor<[32,8,?,32],f16>, %arg1: !torch.vtensor<[32,8,?,32],f16>, %arg2: !torch.vtensor<[32,8,?,32],f16>) -> (!torch.vtensor<[32,8,?,32],f16>, !torch.vtensor<[32,8,?],f32>) {
    %float0.000000e00 = torch.constant.float 0.000000e+00
    %true = torch.constant.bool true
    %none = torch.constant.none
    %none_0 = torch.constant.none
    %0:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%arg0, %arg1, %arg2, %float0.000000e00, %true, %none, %none_0) : (!torch.vtensor<[32,8,?,32],f16>, !torch.vtensor<[32,8,?,32],f16>, !torch.vtensor<[32,8,?,32],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[32,8,?,32],f16>, !torch.vtensor<[32,8,?],f32>)
    return %0#0, %0#1 : !torch.vtensor<[32,8,?,32],f16>, !torch.vtensor<[32,8,?],f32>
  }
}
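For orientation: the module above wraps PyTorch's CPU flash-attention kernel as a torch.operator. A minimal sketch of the equivalent eager call, assuming PyTorch 2.x, a build whose CPU kernel accepts fp16, and an assumed value of 128 for the dynamic sequence dimension:

```python
import torch

# The '?' dimension in the vtensor types is dynamic; 128 is an assumed value.
q = torch.randn(32, 8, 128, 32, dtype=torch.float16)
k = torch.randn(32, 8, 128, 32, dtype=torch.float16)
v = torch.randn(32, 8, 128, 32, dtype=torch.float16)

# dropout_p=0.0 and is_causal=True mirror the %float0.000000e00 and %true
# constants; the two results are the attention output (f16) and the
# per-row logsumexp (f32), matching the IR's result types.
out, logsumexp = torch.ops.aten._scaled_dot_product_flash_attention_for_cpu(
    q, k, v, dropout_p=0.0, is_causal=True
)
print(out.shape, logsumexp.shape)  # [32, 8, 128, 32], [32, 8, 128]
```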
func.func @softy_dispatch_0_softmax_4x128256xf32_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} {
  %cst = arith.constant 0.000000e+00 : f32
  %cst_0 = arith.constant 0xFFC00000 : f32
  %c32_i64 = arith.constant 32 : i64
  %c1_i64 = arith.constant 1 : i64
  %c0_i64 = arith.constant 0 : i64
  %c4_i64 = arith.constant 4 : i64
  %cst_1 = arith.constant dense_resource<torch_tensor_4_torch.int64> : tensor<4xi64>
  %c0 = arith.constant 0 : index
  %0 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
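The dispatch name encodes its shape: softmax_4x128256xf32, a row softmax over a 4x128256 f32 tensor (a reduction across 128256 columns). For reference only, a NumPy sketch of the numerically stable max/exp/sum/divide decomposition that softmax kernels conventionally follow; this is the semantics, not the generated GPU code:

```python
import numpy as np

def stable_softmax(x: np.ndarray) -> np.ndarray:
    # Subtracting the row max before exp() keeps the exponentials in range,
    # so the reduction over 128256 columns cannot overflow.
    m = x.max(axis=-1, keepdims=True)
    e = np.exp(x - m)
    return e / e.sum(axis=-1, keepdims=True)

x = np.random.randn(4, 128256).astype(np.float32)
y = stable_softmax(x)
assert np.allclose(y.sum(axis=-1), 1.0, atol=1e-5)  # each row sums to 1
```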
hal.executable public @prefill_bs4$async_dispatch_3 {
  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts =
#map = affine_map<(d0)[s0] -> (-d0 + s0, 32)>
#map1 = affine_map<()[s0, s1] -> (s0 + s1)>
#map2 = affine_map<(d0) -> (d0)>
#map3 = affine_map<(d0) -> ()>
#pipeline_layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>
#translation = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [4, 1, 1] subgroup_size = 32, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>
module {
  func.func @dynamic_softmax() attributes {translation_info = #translation} {
    %c8 = arith.constant 8 : index
    %c1 = arith.constant 1 : index
// -----// IR Dump After LLVMCPUSelectLoweringStrategyPass (iree-llvmcpu-select-lowering-strategy) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>
#config1 = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>
#executable_target_embedded_elf_x86_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d2)>
#pipeline_layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
module {
  func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64} {
    %c0 = arith.
// (file truncated)
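The dispatch name, mmt4d_DxDxDx16x16x1_f32, says this kernel is a data-tiled matmul: linalg.mmt4d over dynamic M/N/K with 16x16x1 inner tiles, followed by a bias-add and ReLU epilogue. A sketch of the mmt4d contraction semantics in NumPy (reference semantics only; the bias value is an assumption, and this is not the generated code):

```python
import numpy as np

def mmt4d(lhs: np.ndarray, rhs: np.ndarray, acc: np.ndarray) -> np.ndarray:
    # lhs: [M1, K1, m0, k0], rhs: [N1, K1, n0, k0], acc: [M1, N1, m0, n0].
    # The RHS inner tile is transposed relative to plain matmul ("mmt"),
    # so k0 is the innermost dimension of both operands.
    return acc + np.einsum("mkab,nkcb->mnac", lhs, rhs)

# Tile sizes from the dispatch name: m0=16, n0=16, k0=1; outer dims are dynamic.
M1, N1, K1, m0, n0, k0 = 2, 3, 4, 16, 16, 1
lhs = np.random.randn(M1, K1, m0, k0).astype(np.float32)
rhs = np.random.randn(N1, K1, n0, k0).astype(np.float32)
acc = np.zeros((M1, N1, m0, n0), dtype=np.float32)
# 0.5 stands in for the (unshown) bias; bias-add + ReLU mirror the epilogue
# named in the dispatch.
out = np.maximum(mmt4d(lhs, rhs, acc) + 0.5, 0.0)
```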
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  func.func @foo(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %dim = tensor.dim %arg0, %c0 : tensor<?x?xf32>
    %dim_0 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
    %cst = arith.constant 0.000000e+00 : f32
    %0 = tensor.empty(%dim, %dim_0) : tensor<?x?xf32>
    %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
// (file truncated)
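The tensor.empty / linalg.fill pair above is the destination-passing-style prologue: it allocates a %dim x %dim_0 result tensor (rows of %arg0 by columns of %arg1) and zero-fills it before the consuming op accumulates into it. A hedged Python sketch of the same prologue, under the assumption that @foo continues with a matmul; the dump is cut off before the consumer is shown:

```python
import numpy as np

def foo(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    # tensor.empty(%dim, %dim_0) + linalg.fill(0.0): allocate and zero the
    # accumulator, shaped (rows of a, cols of b).
    out = np.zeros((a.shape[0], b.shape[1]), dtype=np.float32)
    # The preview truncates here; a matmul accumulating into `out` is the
    # typical continuation (an assumption, not visible in the dump).
    out += a @ b
    return out
```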