pashu123’s gists

pashu123 / sweep_matmul.py

Created April 9, 2025 17:07

	def generate_mlir(m, n, k):
	# Define the MLIR types
	matA_type = f"tensor<{m}x{k}xf16>"
	matB_type = f"tensor<{n}x{k}xf16>"
	matCF32_type = f"tensor<{m}x{n}xf32>"

	file_name = f"file_{m}_{n}_{k}.mlir"

	# Generate the MLIR function
	mlir_code = f"""

pashu123 / err.txt

Created April 9, 2025 02:55

This file has been truncated, but you can view the full file.

	// -----// IR Dump After BindSymbolicShapesPass (torch-iree-bind-symbolic-shapes) //----- //
	func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
	%int6 = torch.constant.int 6
	%0 = torch.prims.convert_element_type %arg0, %int6 : !torch.vtensor<[2,32,10,16384],f16>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
	%int2 = torch.constant.int 2
	%int3 = torch.constant.int 3
	%1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
	%int0 = torch.constant.int 0
	%true = torch.constant.bool true
	%result0, %result1 = torch.aten.var_mean.correction %0, %1, %int0, %true : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[2,32,1,1],f32>, !torch.vtensor<[2,32,1,1],f32>

pashu123 / vecred.mlir

Created April 3, 2025 12:45

	func.func @matvec_dispatch_0_matmul_transpose_b_32000x1x4096_f16xf16xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [128, 1, 1] subgroup_size = 64>} {
	%cst = arith.constant 0.000000e+00 : f16
	%cst_0 = arith.constant dense<0.000000e+00> : vector<16x1x512xf32>
	%cst_1 = arith.constant dense<0.000000e+00> : vector<16x1xf32>
	%c4096 = arith.constant 4096 : index
	%c0 = arith.constant 0 : index
	%c512 = arith.constant 512 : index
	%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly\|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly\|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly\|Indirect") : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>
	%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<32000x4096xf16, #amdgp

pashu123 / newir.mlir

Created March 28, 2025 07:12

	// -----// IR Dump After CSE (cse) //----- //
	func.func @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} {
	%cst = arith.constant dense<0.000000e+00> : vector<1x1x16xf32>
	%cst_0 = arith.constant dense<0.000000e+00> : vector<1xf32>
	%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x4xf16>
	%c0 = arith.constant 0 : index
	%cst_2 = arith.constant 1.638400e+05 : f32
	%cst_3 = arith.constant 0.000000e+00 : f32
	%c1 = arith.constant 1 : index
	%c40 = arith.constant 40 : index

pashu123 / err.txt

Created March 25, 2025 13:01

This file has been truncated, but you can view the full file.

// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //

#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgrou

pashu123 / cc.sh

Created March 25, 2025 07:51

/home/prashant/iree/.venv/bin/iree-compile --iree-hal-target-backends=rocm --iree-hip-target=gfx942 --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-preprocessing-pass-pipeline="builtin.module(util.func(iree-global-opt-raise-special-ops, iree-flow-canonicalize), iree-preprocessing-transpose-convolution-pipeline, iree-preprocessing-pad-to-intrinsics, util.func(iree-preprocessing-generalize-linalg-matmul-experimental))" --iree-hal-dump-executable-files-to=dump/ --iree-dispatch-creation-enable-aggressive-fusion --iree-dispatch-creation-enable-fuse-horizontal-contractions=false --iree-opt-aggressively-propagate-transposes=true --iree-codegen-llvmgpu-use-vector-distribution=true --iree-opt-data-tiling=false --iree-vm-target-truncate-unsupported-floats --iree-opt-outer-dim-concat=true --iree-codegen-gpu-native-math-precision=true --iree-hal-indirect-command-buffers=true --iree-stream-resource-memory-model=discrete --iree-hal-memoization=true --iree-opt-strip-assertions --iree-global-opt-propagate-tr

pashu123 / compiled_punet_main$async_dispatch_140.mlir

Created March 22, 2025 15:06

hal.executable public @main$async_dispatch_140 {

  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [214

pashu123 / np_attn.py

Created March 15, 2025 10:27

	import numpy as np


	def attention(Q, K, V):
	"""
	Computes attention: softmax(QK^T)V

	Args:
	Q: Query matrix of shape (batch_size, seq_len_q, d)
	K: Key matrix of shape (batch_size, seq_len_k, d)

pashu123 / check.mlir

Created March 10, 2025 17:04

	#map = affine_map<(d0, d1) -> (d0, d1)>
	#map1 = affine_map<(d0, d1) -> (d1, d0)>
	module {
	func.func @matmul_add_transpose(%arg0: tensor<4x4xf32>, %arg1: tensor<4x4xf32>) -> tensor<4x4xf32> {
	%0 = tensor.empty() : tensor<4x4xf32>
	%c2 = arith.constant 2 : index
	%c4 = arith.constant 4 : index
	%c0 = arith.constant 0 : index
	%1 = tensor.empty() : tensor<4x4xf32>
	%2 = scf.for %arg2 = %c0 to %c4 step %c2 iter_args(%arg3 = %1) -> (tensor<4x4xf32>) {

pashu123 / .zsh_history

Created March 10, 2025 16:59

	: 1709405007:0;git clone --depth 1 https://github.com/junegunn/fzf.git ~/.fzf\
	~/.fzf/install
	: 1709405026:0;source ~/.zshrcs
	: 1709405061:0;hipcc --version
	: 1709405083:0;hipcc -o3
	: 1709405161:0;sudo dnf install raedontop
	: 1709405239:0;sudo dnf install cargo
	: 1709405609:0;sudo apt-get intall lld
	: 1709405648:0;sudo dnf install libdrm-dev
	: 1709405653:0;sudo dnf install libdrm-devel

Prashant Kumar pashu123