Prashant Kumar (pashu123), GitHub gists

#include <hip/hip_runtime.h>
#include <hip/hip_bf16.h>
#include <iostream>
#include <vector>
#include <random>
#include <cstdlib>

#define HIP_CHECK(err) hipAssert(err, __FILE__, __LINE__)

// Abort with a readable message when a HIP API call fails.
inline void hipAssert(hipError_t err, const char* file, int line) {
  if (err != hipSuccess) {
    std::cerr << "HIP error: " << hipGetErrorString(err) << " at " << file
              << ":" << line << std::endl;
    std::abort();
  }
}
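For context, a hedged usage sketch of the HIP_CHECK helper above; the round-trip copy and the buffer size are illustrative assumptions, not part of the original gist.

// Hypothetical driver (not from the gist): exercise HIP_CHECK on a
// host-to-device-to-host round trip of an arbitrary 1024-float buffer.
int main() {
  constexpr size_t N = 1024;
  std::vector<float> host(N, 1.0f), back(N, 0.0f);
  float* dev = nullptr;
  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dev), N * sizeof(float)));
  HIP_CHECK(hipMemcpy(dev, host.data(), N * sizeof(float), hipMemcpyHostToDevice));
  HIP_CHECK(hipMemcpy(back.data(), dev, N * sizeof(float), hipMemcpyDeviceToHost));
  HIP_CHECK(hipFree(dev));
  return back[0] == 1.0f ? 0 : 1;  // trivial sanity check
}
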
#include <hip/hip_runtime.h>
#include <hip/hip_bf16.h>
#include <hip/hip_cooperative_groups.h>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <cstdint>
// Cooperative-groups namespace
namespace cg = cooperative_groups;
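The includes above pull in HIP's cooperative-groups API, but the preview cuts off before the kernel. A minimal hedged sketch of the pattern the cg:: alias enables; the kernel name and body are illustrative, not from the gist.

// Hypothetical kernel (not from the gist): per-block reduction using the
// cooperative-groups block handle instead of raw __syncthreads().
__global__ void block_sum(const float* in, float* out, int n) {
  cg::thread_block block = cg::this_thread_block();
  __shared__ float acc;
  if (block.thread_rank() == 0) acc = 0.0f;
  block.sync();  // barrier over the whole thread block
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) atomicAdd(&acc, in[i]);
  block.sync();
  if (block.thread_rank() == 0) out[blockIdx.x] = acc;
}
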
// -----// IR Dump After DropCompilerHintsPass (iree-util-drop-compiler-hints) //----- //
module {
  func.func @faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32() {
    %cst = arith.constant 0.000000e+00 : f32
    %c0_i32 = arith.constant 0 : i32
    %c256_i32 = arith.constant 256 : i32
    %c32_i32 = arith.constant 32 : i32
    %c16_i32 = arith.constant 16 : i32
    %c8_i32 = arith.constant 8 : i32
    %c4_i32 = arith.constant 4 : i32

	.amdgcn_target "amdgcn-amd-amdhsa--gfx942"
	.amdhsa_code_object_version 5
	.text
	.globl faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32
	.p2align 8
	.type faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32,@function
faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32:
	s_load_dwordx2 s[2:3], s[0:1], 0x0
	s_load_dwordx4 s[4:7], s[0:1], 0x8
	s_waitcnt lgkmcnt(0)
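The two scalar loads above fetch the kernel's argument buffer: 8 bytes at offset 0x0 into s[2:3] and 16 bytes at offset 0x8 into s[4:7]. A hedged C++ picture of a kernarg layout consistent with those offsets; treating all three entries as buffer pointers is an assumption of this sketch, not something the listing confirms.

// Assumed kernarg layout (illustrative only): three 8-byte pointers.
struct KernArgs {
  void* ptr0;  // bytes [0x0, 0x8)   -> s[2:3] via s_load_dwordx2
  void* ptr1;  // bytes [0x8, 0x10)  -> s[4:5] (first half of s_load_dwordx4)
  void* ptr2;  // bytes [0x10, 0x18) -> s[6:7] (second half)
};
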
import numpy as np
import os
np.random.seed(0)
# Tensor shapes
shapes = {
"arg0": (2816, 2),
"arg1": (1280, 2816),
}
This file has been truncated.

// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
#map = affine_map<(d0, d1, d2) -> (d2, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
  func.func @faulty(%arg0: tensor<2816x2xf16>, %arg1: tensor<1280x2816xf16>) -> tensor<2x1280xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = tensor.empty() : tensor<2x1280xf32>
    %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x1280xf32>) -> tensor<2x1280xf32>
    %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<2816x2xf16>, tensor<1280x2816xf16>) outs(%1 : tensor<2x1280xf32>) {
This file has been truncated.
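Reading the indexing maps off the dump above: %arg0 is indexed (d2, d0), %arg1 is (d1, d2), and the f32 output is (d0, d1), so the op computes C[d0][d1] = sum over d2 of A[d2][d0] * B[d1][d2] on zero-initialized accumulators. A hedged scalar C++ reference of those semantics; plain float stands in for the f16 operands, and the row-major flattening is an assumption of this sketch.

#include <vector>

// Reference for the linalg.generic above: 2x1280 output, 2816-long reduction.
void reference_matmul_like(const std::vector<float>& A,  // 2816 x 2 (f16 widened)
                           const std::vector<float>& B,  // 1280 x 2816 (f16 widened)
                           std::vector<float>& C) {      // 2 x 1280, f32
  const int D0 = 2, D1 = 1280, D2 = 2816;
  for (int d0 = 0; d0 < D0; ++d0) {
    for (int d1 = 0; d1 < D1; ++d1) {
      float acc = 0.0f;  // matches the linalg.fill with %cst = 0.0 : f32
      for (int d2 = 0; d2 < D2; ++d2)
        acc += A[d2 * D0 + d0] * B[d1 * D2 + d2];
      C[d0 * D1 + d1] = acc;
    }
  }
}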