Prashant Kumar (pashu123): gists
// ========== gist file (preview truncated) ==========
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  func.func @foo(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %dim = tensor.dim %arg0, %c0 : tensor<?x?xf32>
    %dim_0 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
    %cst = arith.constant 0.000000e+00 : f32
    %0 = tensor.empty(%dim, %dim_0) : tensor<?x?xf32>
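    // The preview cuts off before the ops that consume %0. A minimal sketch of a
    // plausible continuation, assuming the truncated body zero-fills the result and
    // runs a dynamic-shaped matmul (consistent with the dim(%arg0, 0) x dim(%arg1, 1)
    // output shape); the identity #map above hints at an extra elementwise op that is
    // not shown here, so the actual gist may differ.
    %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
    %2 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%1 : tensor<?x?xf32>) -> tensor<?x?xf32>
    return %2 : tensor<?x?xf32>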
// ========== next gist file (preview truncated) ==========
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)>
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d2)>
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()>
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d1, d4)>
#map5 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>
module @module {
util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<128256x4096xbf16>
util.global private @__auto.blk.0.attn_norm.weight = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.0.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_q.q_input:rscale"> : tensor<f32>
hal.executable public @decode_bs4$async_dispatch_28 {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts =
func.func @prefill_bs4$async_dispatch_20_elementwise_4xDx4096_bf16xf32xf32xf32xf32xf8E4M3FNUZ() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} {
%c32 = arith.constant 32 : index
%c67108864 = arith.constant 67108864 : index
%c32_i64 = arith.constant 32 : i64
%c2_i64 = arith.constant 2 : i64
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 4.096000e+03 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%cst_2 = arith.constant -2.400000e+02 : f32
%cst_3 = arith.constant 2.400000e+02 : f32
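The excerpt stops at the constants, but the values are telling: 4.096000e+03 and 9.99999974E-6 look like an RMS-norm divisor and epsilon for a 4096-wide hidden dimension, and the pair -2.400000e+02 / 2.400000e+02 matches the finite range of the f8E4M3FNUZ type named in the function. A minimal sketch, purely as an assumption about how the truncated body uses the clamp constants before quantizing (%val is a placeholder, not a name from the gist):

  // Clamp to [-240, 240], the f8E4M3FNUZ finite range, then truncate to fp8.
  %clamped_lo = arith.maximumf %val, %cst_2 : f32
  %clamped = arith.minimumf %clamped_lo, %cst_3 : f32
  %quant = arith.truncf %clamped : f32 to f8E4M3FNUZ
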
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
func.func @matmul(%arg0: tensor<2x2816xf16>, %arg1: tensor<2816x1280xf16>) -> tensor<2x1280xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  %0 = tensor.empty() : tensor<2x1280xf32>
  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x1280xf32>) -> tensor<2x1280xf32>
  %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<2x2816xf16>, tensor<2816x1280xf16>) outs(%1 : tensor<2x1280xf32>) {
  ^bb0(%in: f16, %in_0: f16, %out: f32):
    %3 = arith.extf %in : f16 to f32
// ========== next gist file (preview truncated) ==========
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
  func.func @matmul(%arg0: tensor<2x2816xf16>, %arg1: tensor<2816x1280xf16>) -> tensor<2x1280xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = tensor.empty() : tensor<2x1280xf32>
    %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x1280xf32>) -> tensor<2x1280xf32>
    %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<2x2816xf16>, tensor<2816x1280xf16>) outs(%1 : tensor<2x1280xf32>) {
#map = affine_map<(d0, d1, d2) -> (d0, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
module {
  func.func @matmul(%arg0: tensor<2x2816xf16>, %arg1: tensor<2816x1280xf16>) -> tensor<2x1280xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = tensor.empty() : tensor<2x1280xf32>
    %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x1280xf32>) -> tensor<2x1280xf32>
    %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<2x2816xf16>, tensor<2816x1280xf16>) outs(%1 : tensor<2x1280xf32>) {
    ^bb0(%in: f16, %in_0: f16, %out: f32):
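Each matmul preview above stops inside the linalg.generic region. For reference, a self-contained sketch of the usual mixed-precision body (widen both f16 operands, multiply, accumulate into the f32 init), written with the (d0, d2)/(d2, d1)/(d0, d1) indexing from the first dump; the function name is illustrative and the elided lines in the gists may differ:

func.func @matmul_f16_f32(%arg0: tensor<2x2816xf16>, %arg1: tensor<2816x1280xf16>) -> tensor<2x1280xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  %0 = tensor.empty() : tensor<2x1280xf32>
  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x1280xf32>) -> tensor<2x1280xf32>
  %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<2x2816xf16>, tensor<2816x1280xf16>) outs(%1 : tensor<2x1280xf32>) {
  ^bb0(%in: f16, %in_0: f16, %out: f32):
    // Widen both f16 inputs, multiply, and accumulate into the f32 accumulator.
    %3 = arith.extf %in : f16 to f32
    %4 = arith.extf %in_0 : f16 to f32
    %5 = arith.mulf %3, %4 : f32
    %6 = arith.addf %out, %5 : f32
    linalg.yield %6 : f32
  } -> tensor<2x1280xf32>
  return %2 : tensor<2x1280xf32>
}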
func.func @softmax_4d_dispatch_0_softmax_1x1x16384x16384xf32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [512, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} {
  %cst = arith.constant dense<0.000000e+00> : vector<2048xf32>
  %cst_0 = arith.constant dense<0xFFC00000> : vector<2048xf32>
  %c2048 = arith.constant 2048 : index
  %c16384 = arith.constant 16384 : index
  %cst_1 = arith.constant 0.000000e+00 : f32
  %cst_2 = arith.constant 0xFFC00000 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1x1x16384x16384xf32, #hal.descriptor_typ
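The dispatch above is already lowered to workgroup code and the preview is cut inside the first binding. For orientation, a minimal sketch of the linalg-level input such a 1x1x16384x16384 softmax dispatch is typically generated from, assuming the reduction runs over the innermost dimension (an assumption, not taken from the gist):

func.func @softmax_4d(%arg0: tensor<1x1x16384x16384xf32>) -> tensor<1x1x16384x16384xf32> {
  %0 = tensor.empty() : tensor<1x1x16384x16384xf32>
  // Numerically stable softmax over dimension 3: max, subtract, exp, sum, divide.
  %1 = linalg.softmax dimension(3) ins(%arg0 : tensor<1x1x16384x16384xf32>) outs(%0 : tensor<1x1x16384x16384xf32>) -> tensor<1x1x16384x16384xf32>
  return %1 : tensor<1x1x16384x16384xf32>
}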
func.func @vector_expansion(%47: vector<1xf32>) -> f32 {
  %cst_0 = arith.constant dense<0.000000e+00> : vector<1xf32>
  %cst_3 = arith.constant 0.000000e+00 : f32
  %48 = vector.extract %47[0] : f32 from vector<1xf32>
  %49 = gpu.subgroup_reduce add %48 cluster(size = 8) : (f32) -> f32
  %50 = vector.insert %49, %cst_0 [0] : f32 into vector<1xf32>
  %51 = vector.shape_cast %50 : vector<1xf32> to vector<1xf32>
  %52 = vector.broadcast %cst_3 : f32 to vector<1xf32>
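The excerpt packs a scalar into vector<1xf32> around the reduction, which is why the extract/insert/shape_cast pairs appear and why the preview ends before the return. Stripped of that packing, the core pattern is just a clustered subgroup reduction; a minimal standalone sketch (function name is illustrative, not from the gist):

func.func @cluster_reduce(%x: f32) -> f32 {
  // Sums %x across clusters of 8 contiguous lanes within the subgroup.
  %r = gpu.subgroup_reduce add %x cluster(size = 8) : (f32) -> f32
  return %r : f32
}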
#include <hip/hip_runtime.h>
#include <hip/hip_bf16.h>
#include <iostream>
#include <vector>
#include <random>
#define HIP_CHECK(err) hipAssert(err, __FILE__, __LINE__)
inline void hipAssert(hipError_t err, const char* file, int line) {
  if (err != hipSuccess) {
    std::cerr << "HIP error: " << hipGetErrorString(err) << " at " << file << ":" << line << std::endl;