Prashant Kumar (pashu123): gists
// ========== gist file (preview truncated) ==========
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  func.func @foo(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %dim = tensor.dim %arg0, %c0 : tensor<?x?xf32>
    %dim_0 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
    %cst = arith.constant 0.000000e+00 : f32
    %0 = tensor.empty(%dim, %dim_0) : tensor<?x?xf32>
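    // The preview cuts off before the ops that consume %0. A minimal sketch of a
    // plausible continuation, assuming the truncated body zero-fills the result and
    // runs a dynamic-shaped matmul (consistent with the dim(%arg0, 0) x dim(%arg1, 1)
    // output shape); the identity #map above hints at an extra elementwise op that is
    // not shown here, so the actual gist may differ.
    %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
    %2 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%1 : tensor<?x?xf32>) -> tensor<?x?xf32>
    return %2 : tensor<?x?xf32>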
// ========== next gist file (preview truncated) ==========
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)>
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d2)>
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()>
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d1, d4)>
#map5 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>
module @module {
util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<128256x4096xbf16>
util.global private @__auto.blk.0.attn_norm.weight = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.0.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_q.q_input:rscale"> : tensor<f32>
hal.executable public @decode_bs4$async_dispatch_28 {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts =
func.func @prefill_bs4$async_dispatch_20_elementwise_4xDx4096_bf16xf32xf32xf32xf32xf8E4M3FNUZ() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} {
%c32 = arith.constant 32 : index
%c67108864 = arith.constant 67108864 : index
%c32_i64 = arith.constant 32 : i64
%c2_i64 = arith.constant 2 : i64
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 4.096000e+03 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%cst_2 = arith.constant -2.400000e+02 : f32
%cst_3 = arith.constant 2.400000e+02 : f32
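The excerpt stops at the constants, but the values are telling: 4.096000e+03 and 9.99999974E-6 look like an RMS-norm divisor and epsilon for a 4096-wide hidden dimension, and the pair -2.400000e+02 / 2.400000e+02 matches the finite range of the f8E4M3FNUZ type named in the function. A minimal sketch, purely as an assumption about how the truncated body uses the clamp constants before quantizing (%val is a placeholder, not a name from the gist):

  // Clamp to [-240, 240], the f8E4M3FNUZ finite range, then truncate to fp8.
  %clamped_lo = arith.maximumf %val, %cst_2 : f32
  %clamped = arith.minimumf %clamped_lo, %cst_3 : f32
  %quant = arith.truncf %clamped : f32 to f8E4M3FNUZ
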
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
func.func @matmul(%arg0: tensor<2x2816xf16>, %arg1: tensor<2816x1280xf16>) -> tensor<2x1280xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  %0 = tensor.empty() : tensor<2x1280xf32>
  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x1280xf32>) -> tensor<2x1280xf32>
  %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<2x2816xf16>, tensor<2816x1280xf16>) outs(%1 : tensor<2x1280xf32>) {
  ^bb0(%in: f16, %in_0: f16, %out: f32):
    %3 = arith.extf %in : f16 to f32
// ========== next gist file (preview truncated) ==========
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
  func.func @matmul(%arg0: tensor<2x2816xf16>, %arg1: tensor<2816x1280xf16>) -> tensor<2x1280xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = tensor.empty() : tensor<2x1280xf32>
    %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x1280xf32>) -> tensor<2x1280xf32>
    %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<2x2816xf16>, tensor<2816x1280xf16>) outs(%1 : tensor<2x1280xf32>) {
#map = affine_map<(d0, d1, d2) -> (d0, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
module {
  func.func @matmul(%arg0: tensor<2x2816xf16>, %arg1: tensor<2816x1280xf16>) -> tensor<2x1280xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = tensor.empty() : tensor<2x1280xf32>
    %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x1280xf32>) -> tensor<2x1280xf32>
    %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<2x2816xf16>, tensor<2816x1280xf16>) outs(%1 : tensor<2x1280xf32>) {
    ^bb0(%in: f16, %in_0: f16, %out: f32):
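Each matmul preview above stops inside the linalg.generic region. For reference, a self-contained sketch of the usual mixed-precision body (widen both f16 operands, multiply, accumulate into the f32 init), written with the (d0, d2)/(d2, d1)/(d0, d1) indexing from the first dump; the function name is illustrative and the elided lines in the gists may differ:

func.func @matmul_f16_f32(%arg0: tensor<2x2816xf16>, %arg1: tensor<2816x1280xf16>) -> tensor<2x1280xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  %0 = tensor.empty() : tensor<2x1280xf32>
  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x1280xf32>) -> tensor<2x1280xf32>
  %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<2x2816xf16>, tensor<2816x1280xf16>) outs(%1 : tensor<2x1280xf32>) {
  ^bb0(%in: f16, %in_0: f16, %out: f32):
    // Widen both f16 inputs, multiply, and accumulate into the f32 accumulator.
    %3 = arith.extf %in : f16 to f32
    %4 = arith.extf %in_0 : f16 to f32
    %5 = arith.mulf %3, %4 : f32
    %6 = arith.addf %out, %5 : f32
    linalg.yield %6 : f32
  } -> tensor<2x1280xf32>
  return %2 : tensor<2x1280xf32>
}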
func.func @softmax_4d_dispatch_0_softmax_1x1x16384x16384xf32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [512, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} {
  %cst = arith.constant dense<0.000000e+00> : vector<2048xf32>
  %cst_0 = arith.constant dense<0xFFC00000> : vector<2048xf32>
  %c2048 = arith.constant 2048 : index
  %c16384 = arith.constant 16384 : index
  %cst_1 = arith.constant 0.000000e+00 : f32
  %cst_2 = arith.constant 0xFFC00000 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1x1x16384x16384xf32, #hal.descriptor_typ
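The dispatch above is already lowered to workgroup code and the preview is cut inside the first binding. For orientation, a minimal sketch of the linalg-level input such a 1x1x16384x16384 softmax dispatch is typically generated from, assuming the reduction runs over the innermost dimension (an assumption, not taken from the gist):

func.func @softmax_4d(%arg0: tensor<1x1x16384x16384xf32>) -> tensor<1x1x16384x16384xf32> {
  %0 = tensor.empty() : tensor<1x1x16384x16384xf32>
  // Numerically stable softmax over dimension 3: max, subtract, exp, sum, divide.
  %1 = linalg.softmax dimension(3) ins(%arg0 : tensor<1x1x16384x16384xf32>) outs(%0 : tensor<1x1x16384x16384xf32>) -> tensor<1x1x16384x16384xf32>
  return %1 : tensor<1x1x16384x16384xf32>
}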
func.func @vector_expansion(%47: vector<1xf32>) -> f32 {
  %cst_0 = arith.constant dense<0.000000e+00> : vector<1xf32>
  %cst_3 = arith.constant 0.000000e+00 : f32
  %48 = vector.extract %47[0] : f32 from vector<1xf32>
  %49 = gpu.subgroup_reduce add %48 cluster(size = 8) : (f32) -> f32
  %50 = vector.insert %49, %cst_0 [0] : f32 into vector<1xf32>
  %51 = vector.shape_cast %50 : vector<1xf32> to vector<1xf32>
  %52 = vector.broadcast %cst_3 : f32 to vector<1xf32>
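The excerpt packs a scalar into vector<1xf32> around the reduction, which is why the extract/insert/shape_cast pairs appear and why the preview ends before the return. Stripped of that packing, the core pattern is just a clustered subgroup reduction; a minimal standalone sketch (function name is illustrative, not from the gist):

func.func @cluster_reduce(%x: f32) -> f32 {
  // Sums %x across clusters of 8 contiguous lanes within the subgroup.
  %r = gpu.subgroup_reduce add %x cluster(size = 8) : (f32) -> f32
  return %r : f32
}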
#include <hip/hip_runtime.h>
#include <hip/hip_bf16.h>
#include <iostream>
#include <vector>
#include <random>
#define HIP_CHECK(err) hipAssert(err, __FILE__, __LINE__)
inline void hipAssert(hipError_t err, const char* file, int line) {
  if (err != hipSuccess) {
    std::cerr << "HIP error: " << hipGetErrorString(err) << " at " << file << ":" << line << std::endl;