Prashant Kumar (pashu123)
-----------------------------------------------------------------------------------------------------------
Benchmark                                         Time             CPU   Iterations UserCounters...
-----------------------------------------------------------------------------------------------------------
BM__4_14336_4096/process_time/real_time       0.517 ms        0.524 ms         1353 items_per_second=1.93239k/s
BM__4_14336_4096/process_time/real_time       0.517 ms        0.524 ms         1353 items_per_second=1.93329k/s
BM__4_14336_4096/process_time/real_time       0.517 ms        0.524 ms         1353 items_per_second=1.93427k/s
BM__4_14336_4096/process_time/real_time       0.517 ms        0.524 ms         1353 items_per_second=1.93414k/s
BM__4_14336_4096/process_time/real_time       0.517 ms        0.524 ms         1353 items_per_second=1.93428k/s
BM__4_14336_4096/process_time/real_time_mean  0.517 ms        0.524 ms            5 items_per_second=1.93368k/s
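Back-of-the-envelope math on those numbers, assuming the BM__4_14336_4096 name encodes m=4, n=14336, k=4096 (matching the file_{m}_{n}_{k} convention in the generator below; the shape is not stated in the log itself):

# Hedged sketch: derive achieved throughput from the run above.
m, n, k = 4, 14336, 4096
flops = 2 * m * n * k                     # one mul + one add per (m, n, k) triple
real_time_s = 0.517e-3                    # "Time" column
b_bytes = n * k * 2                       # f16 B matrix, the dominant memory traffic
print(f"{flops / real_time_s / 1e12:.3f} TFLOP/s")   # ~0.909 TFLOP/s
print(f"{b_bytes / real_time_s / 1e9:.0f} GB/s")      # ~227 GB/s: a bandwidth-bound shape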
def generate_mlir(m, n, k):
    # Define the MLIR types
    matA_type = f"tensor<{m}x{k}xf16>"
    matB_type = f"tensor<{n}x{k}xf16>"
    matCF32_type = f"tensor<{m}x{n}xf32>"
    file_name = f"file_{m}_{n}_{k}.mlir"
    # Generate the MLIR function. The gist is truncated at this point; the body
    # below is a hedged reconstruction (a transpose-B matmul accumulating in f32,
    # which is what the f16/f16/f32 types above imply).
    mlir_code = f"""func.func @matmul_transpose_b(%lhs: {matA_type}, %rhs: {matB_type}) -> {matCF32_type} {{
  %cst = arith.constant 0.000000e+00 : f32
  %empty = tensor.empty() : {matCF32_type}
  %fill = linalg.fill ins(%cst : f32) outs(%empty : {matCF32_type}) -> {matCF32_type}
  %res = linalg.matmul_transpose_b ins(%lhs, %rhs : {matA_type}, {matB_type}) outs(%fill : {matCF32_type}) -> {matCF32_type}
  return %res : {matCF32_type}
}}"""
    with open(file_name, "w") as f:
        f.write(mlir_code)
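For example, generate_mlir(4, 14336, 4096) writes file_4_14336_4096.mlir, the same shape that the BM__4_14336_4096 benchmark above exercises.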
// -----// IR Dump After BindSymbolicShapesPass (torch-iree-bind-symbolic-shapes) //----- //
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
%int6 = torch.constant.int 6
%0 = torch.prims.convert_element_type %arg0, %int6 : !torch.vtensor<[2,32,10,16384],f16>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
%int2 = torch.constant.int 2
%int3 = torch.constant.int 3
%1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
%int0 = torch.constant.int 0
%true = torch.constant.bool true
%result0, %result1 = torch.aten.var_mean.correction %0, %1, %int0, %true : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[2,32,1,1],f32>, !torch.vtensor<[2,32,1,1],f32>
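For reference, this dump corresponds to a var/mean reduction over the trailing two dimensions in f32: dtype enum 6 is float32, correction is 0, and keepdim is true. A minimal PyTorch sketch that lowers to essentially this IR; the original source is not in the gist, so treat it as a hedged reconstruction:

import torch

def var_mean_f32(x):
    # x: (2, 32, 10, 16384) f16. The IR upcasts to f32, then reduces dims (2, 3)
    # with correction=0, keepdim=True, returning (var, mean) of shape (2, 32, 1, 1).
    return torch.var_mean(x.to(torch.float32), dim=(2, 3), correction=0, keepdim=True)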
func.func @matvec_dispatch_0_matmul_transpose_b_32000x1x4096_f16xf16xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [128, 1, 1] subgroup_size = 64>} {
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant dense<0.000000e+00> : vector<16x1x512xf32>
%cst_1 = arith.constant dense<0.000000e+00> : vector<16x1xf32>
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%c512 = arith.constant 512 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<32000x4096xf16, #amdgp
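The kernel above is cut off by the gist viewer, but its semantics follow from the dispatch name: a transpose-B matvec with f32 accumulation. A hedged NumPy sketch of the computation (not of the warp-reduction codegen), assuming the 32000x1x4096 suffix encodes M x N x K:

import numpy as np

def matvec_transpose_b(a, x):
    # a: (32000, 4096) f16 weights (binding 0 above), x: (4096,) f16 vector;
    # accumulate in f32 as the _f16xf16xf32 suffix indicates -> (32000,) f32.
    return a.astype(np.float32) @ x.astype(np.float32)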
// -----// IR Dump After CSE (cse) //----- //
func.func @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x16xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<1xf32>
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x4xf16>
%c0 = arith.constant 0 : index
%cst_2 = arith.constant 1.638400e+05 : f32
%cst_3 = arith.constant 0.000000e+00 : f32
%c1 = arith.constant 1 : index
%c40 = arith.constant 40 : index
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgrou
/home/prashant/iree/.venv/bin/iree-compile \
  --iree-hal-target-backends=rocm \
  --iree-hip-target=gfx942 \
  --iree-vm-bytecode-module-output-format=flatbuffer-binary \
  --iree-preprocessing-pass-pipeline="builtin.module(util.func(iree-global-opt-raise-special-ops, iree-flow-canonicalize), iree-preprocessing-transpose-convolution-pipeline, iree-preprocessing-pad-to-intrinsics, util.func(iree-preprocessing-generalize-linalg-matmul-experimental))" \
  --iree-hal-dump-executable-files-to=dump/ \
  --iree-dispatch-creation-enable-aggressive-fusion \
  --iree-dispatch-creation-enable-fuse-horizontal-contractions=false \
  --iree-opt-aggressively-propagate-transposes=true \
  --iree-codegen-llvmgpu-use-vector-distribution=true \
  --iree-opt-data-tiling=false \
  --iree-vm-target-truncate-unsupported-floats \
  --iree-opt-outer-dim-concat=true \
  --iree-codegen-gpu-native-math-precision=true \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-opt-strip-assertions \
  --iree-global-opt-propagate-tr
hal.executable public @main$async_dispatch_140 {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [214
import numpy as np
def attention(Q, K, V):
    """
    Computes attention: softmax(QK^T)V
    Args:
        Q: Query matrix of shape (batch_size, seq_len_q, d)
        K: Key matrix of shape (batch_size, seq_len_k, d)
        V: Value matrix of shape (batch_size, seq_len_k, d)
    Returns: output of shape (batch_size, seq_len_q, d).
    (The gist is truncated after the K line; the rest is a hedged completion.)
    """
    scores = Q @ K.transpose(0, 2, 1)               # (batch, seq_q, seq_k)
    scores -= scores.max(axis=-1, keepdims=True)    # numerically stable softmax
    weights = np.exp(scores)
    return (weights / weights.sum(axis=-1, keepdims=True)) @ V
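A quick shape check for the helper (random inputs, hypothetical sizes):

rng = np.random.default_rng(0)
Q = rng.standard_normal((2, 5, 8))
K = rng.standard_normal((2, 7, 8))
V = rng.standard_normal((2, 7, 8))
print(attention(Q, K, V).shape)  # (2, 5, 8)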
#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d1, d0)>
module {
func.func @matmul_add_transpose(%arg0: tensor<4x4xf32>, %arg1: tensor<4x4xf32>) -> tensor<4x4xf32> {
%0 = tensor.empty() : tensor<4x4xf32>
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%1 = tensor.empty() : tensor<4x4xf32>
%2 = scf.for %arg2 = %c0 to %c4 step %c2 iter_args(%arg3 = %1) -> (tensor<4x4xf32>) {