Prashant Kumar (pashu123) - GitHub gists
module {
  func.func @flash_attention_func(%arg0: !torch.vtensor<[32,8,?,32],f16>, %arg1: !torch.vtensor<[32,8,?,32],f16>, %arg2: !torch.vtensor<[32,8,?,32],f16>) -> (!torch.vtensor<[32,8,?,32],f16>, !torch.vtensor<[32,8,?],f32>) {
    %float0.000000e00 = torch.constant.float 0.000000e+00
    %true = torch.constant.bool true
    %none = torch.constant.none
    %none_0 = torch.constant.none
    %0:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%arg0, %arg1, %arg2, %float0.000000e00, %true, %none, %none_0) : (!torch.vtensor<[32,8,?,32],f16>, !torch.vtensor<[32,8,?,32],f16>, !torch.vtensor<[32,8,?,32],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[32,8,?,32],f16>, !torch.vtensor<[32,8,?],f32>)
    return %0#0, %0#1 : !torch.vtensor<[32,8,?,32],f16>, !torch.vtensor<[32,8,?],f32>
  }
}
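For orientation: the module above wraps PyTorch's CPU flash-attention kernel as a torch.operator. A minimal sketch of the equivalent eager call, assuming PyTorch 2.x, a build whose CPU kernel accepts fp16, and an assumed value of 128 for the dynamic sequence dimension:

```python
import torch

# The '?' dimension in the vtensor types is dynamic; 128 is an assumed value.
q = torch.randn(32, 8, 128, 32, dtype=torch.float16)
k = torch.randn(32, 8, 128, 32, dtype=torch.float16)
v = torch.randn(32, 8, 128, 32, dtype=torch.float16)

# dropout_p=0.0 and is_causal=True mirror the %float0.000000e00 and %true
# constants; the two results are the attention output (f16) and the
# per-row logsumexp (f32), matching the IR's result types.
out, logsumexp = torch.ops.aten._scaled_dot_product_flash_attention_for_cpu(
    q, k, v, dropout_p=0.0, is_causal=True
)
print(out.shape, logsumexp.shape)  # [32, 8, 128, 32], [32, 8, 128]
```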
func.func @softy_dispatch_0_softmax_4x128256xf32_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} {
  %cst = arith.constant 0.000000e+00 : f32
  %cst_0 = arith.constant 0xFFC00000 : f32
  %c32_i64 = arith.constant 32 : i64
  %c1_i64 = arith.constant 1 : i64
  %c0_i64 = arith.constant 0 : i64
  %c4_i64 = arith.constant 4 : i64
  %cst_1 = arith.constant dense_resource<torch_tensor_4_torch.int64> : tensor<4xi64>
  %c0 = arith.constant 0 : index
  %0 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
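The dispatch name encodes its shape: softmax_4x128256xf32, a row softmax over a 4x128256 f32 tensor (a reduction across 128256 columns). For reference only, a NumPy sketch of the numerically stable max/exp/sum/divide decomposition that softmax kernels conventionally follow; this is the semantics, not the generated GPU code:

```python
import numpy as np

def stable_softmax(x: np.ndarray) -> np.ndarray:
    # Subtracting the row max before exp() keeps the exponentials in range,
    # so the reduction over 128256 columns cannot overflow.
    m = x.max(axis=-1, keepdims=True)
    e = np.exp(x - m)
    return e / e.sum(axis=-1, keepdims=True)

x = np.random.randn(4, 128256).astype(np.float32)
y = stable_softmax(x)
assert np.allclose(y.sum(axis=-1), 1.0, atol=1e-5)  # each row sums to 1
```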
hal.executable public @prefill_bs4$async_dispatch_3 {
  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts =
#map = affine_map<(d0)[s0] -> (-d0 + s0, 32)>
#map1 = affine_map<()[s0, s1] -> (s0 + s1)>
#map2 = affine_map<(d0) -> (d0)>
#map3 = affine_map<(d0) -> ()>
#pipeline_layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>
#translation = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [4, 1, 1] subgroup_size = 32, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>
module {
  func.func @dynamic_softmax() attributes {translation_info = #translation} {
    %c8 = arith.constant 8 : index
    %c1 = arith.constant 1 : index
// -----// IR Dump After LLVMCPUSelectLoweringStrategyPass (iree-llvmcpu-select-lowering-strategy) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>
#config1 = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>
#executable_target_embedded_elf_x86_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d2)>
#pipeline_layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
module {
  func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64} {
    %c0 = arith.
// (file truncated)
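The dispatch name, mmt4d_DxDxDx16x16x1_f32, says this kernel is a data-tiled matmul: linalg.mmt4d over dynamic M/N/K with 16x16x1 inner tiles, followed by a bias-add and ReLU epilogue. A sketch of the mmt4d contraction semantics in NumPy (reference semantics only; the bias value is an assumption, and this is not the generated code):

```python
import numpy as np

def mmt4d(lhs: np.ndarray, rhs: np.ndarray, acc: np.ndarray) -> np.ndarray:
    # lhs: [M1, K1, m0, k0], rhs: [N1, K1, n0, k0], acc: [M1, N1, m0, n0].
    # The RHS inner tile is transposed relative to plain matmul ("mmt"),
    # so k0 is the innermost dimension of both operands.
    return acc + np.einsum("mkab,nkcb->mnac", lhs, rhs)

# Tile sizes from the dispatch name: m0=16, n0=16, k0=1; outer dims are dynamic.
M1, N1, K1, m0, n0, k0 = 2, 3, 4, 16, 16, 1
lhs = np.random.randn(M1, K1, m0, k0).astype(np.float32)
rhs = np.random.randn(N1, K1, n0, k0).astype(np.float32)
acc = np.zeros((M1, N1, m0, n0), dtype=np.float32)
# 0.5 stands in for the (unshown) bias; bias-add + ReLU mirror the epilogue
# named in the dispatch.
out = np.maximum(mmt4d(lhs, rhs, acc) + 0.5, 0.0)
```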
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  func.func @foo(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %dim = tensor.dim %arg0, %c0 : tensor<?x?xf32>
    %dim_0 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
    %cst = arith.constant 0.000000e+00 : f32
    %0 = tensor.empty(%dim, %dim_0) : tensor<?x?xf32>
    %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
// (file truncated)
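The tensor.empty / linalg.fill pair above is the destination-passing-style prologue: it allocates a %dim x %dim_0 result tensor (rows of %arg0 by columns of %arg1) and zero-fills it before the consuming op accumulates into it. A hedged Python sketch of the same prologue, under the assumption that @foo continues with a matmul; the dump is cut off before the consumer is shown:

```python
import numpy as np

def foo(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    # tensor.empty(%dim, %dim_0) + linalg.fill(0.0): allocate and zero the
    # accumulator, shaped (rows of a, cols of b).
    out = np.zeros((a.shape[0], b.shape[1]), dtype=np.float32)
    # The preview truncates here; a matmul accumulating into `out` is the
    # typical continuation (an assumption, not visible in the dump).
    out += a @ b
    return out
```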