Prashant Kumar (pashu123)
-----------------------------------------------------------------------------------------------------------
Benchmark                                         Time             CPU   Iterations UserCounters...
-----------------------------------------------------------------------------------------------------------
BM__4_14336_4096/process_time/real_time       0.517 ms        0.524 ms         1353 items_per_second=1.93239k/s
BM__4_14336_4096/process_time/real_time       0.517 ms        0.524 ms         1353 items_per_second=1.93329k/s
BM__4_14336_4096/process_time/real_time       0.517 ms        0.524 ms         1353 items_per_second=1.93427k/s
BM__4_14336_4096/process_time/real_time       0.517 ms        0.524 ms         1353 items_per_second=1.93414k/s
BM__4_14336_4096/process_time/real_time       0.517 ms        0.524 ms         1353 items_per_second=1.93428k/s
BM__4_14336_4096/process_time/real_time_mean  0.517 ms        0.524 ms            5 items_per_second=1.93368k/s
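Back-of-the-envelope math on those numbers, assuming the BM__4_14336_4096 name encodes m=4, n=14336, k=4096 (matching the file_{m}_{n}_{k} convention in the generator below; the shape is not stated in the log itself):

# Hedged sketch: derive achieved throughput from the run above.
m, n, k = 4, 14336, 4096
flops = 2 * m * n * k                     # one mul + one add per (m, n, k) triple
real_time_s = 0.517e-3                    # "Time" column
b_bytes = n * k * 2                       # f16 B matrix, the dominant memory traffic
print(f"{flops / real_time_s / 1e12:.3f} TFLOP/s")   # ~0.909 TFLOP/s
print(f"{b_bytes / real_time_s / 1e9:.0f} GB/s")      # ~227 GB/s: a bandwidth-bound shape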
def generate_mlir(m, n, k):
    # Define the MLIR types
    matA_type = f"tensor<{m}x{k}xf16>"
    matB_type = f"tensor<{n}x{k}xf16>"
    matCF32_type = f"tensor<{m}x{n}xf32>"
    file_name = f"file_{m}_{n}_{k}.mlir"
    # Generate the MLIR function. The gist is truncated at this point; the body
    # below is a hedged reconstruction (a transpose-B matmul accumulating in f32,
    # which is what the f16/f16/f32 types above imply).
    mlir_code = f"""func.func @matmul_transpose_b(%lhs: {matA_type}, %rhs: {matB_type}) -> {matCF32_type} {{
  %cst = arith.constant 0.000000e+00 : f32
  %empty = tensor.empty() : {matCF32_type}
  %fill = linalg.fill ins(%cst : f32) outs(%empty : {matCF32_type}) -> {matCF32_type}
  %res = linalg.matmul_transpose_b ins(%lhs, %rhs : {matA_type}, {matB_type}) outs(%fill : {matCF32_type}) -> {matCF32_type}
  return %res : {matCF32_type}
}}"""
    with open(file_name, "w") as f:
        f.write(mlir_code)
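For example, generate_mlir(4, 14336, 4096) writes file_4_14336_4096.mlir, the same shape that the BM__4_14336_4096 benchmark above exercises.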
// -----// IR Dump After BindSymbolicShapesPass (torch-iree-bind-symbolic-shapes) //----- //
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
%int6 = torch.constant.int 6
%0 = torch.prims.convert_element_type %arg0, %int6 : !torch.vtensor<[2,32,10,16384],f16>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
%int2 = torch.constant.int 2
%int3 = torch.constant.int 3
%1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
%int0 = torch.constant.int 0
%true = torch.constant.bool true
%result0, %result1 = torch.aten.var_mean.correction %0, %1, %int0, %true : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[2,32,1,1],f32>, !torch.vtensor<[2,32,1,1],f32>
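For reference, this dump corresponds to a var/mean reduction over the trailing two dimensions in f32: dtype enum 6 is float32, correction is 0, and keepdim is true. A minimal PyTorch sketch that lowers to essentially this IR; the original source is not in the gist, so treat it as a hedged reconstruction:

import torch

def var_mean_f32(x):
    # x: (2, 32, 10, 16384) f16. The IR upcasts to f32, then reduces dims (2, 3)
    # with correction=0, keepdim=True, returning (var, mean) of shape (2, 32, 1, 1).
    return torch.var_mean(x.to(torch.float32), dim=(2, 3), correction=0, keepdim=True)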
func.func @matvec_dispatch_0_matmul_transpose_b_32000x1x4096_f16xf16xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [128, 1, 1] subgroup_size = 64>} {
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant dense<0.000000e+00> : vector<16x1x512xf32>
%cst_1 = arith.constant dense<0.000000e+00> : vector<16x1xf32>
%c4096 = arith.constant 4096 : index
%c0 = arith.constant 0 : index
%c512 = arith.constant 512 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> to memref<32000x4096xf16, #amdgp
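The kernel above is cut off by the gist viewer, but its semantics follow from the dispatch name: a transpose-B matvec with f32 accumulation. A hedged NumPy sketch of the computation (not of the warp-reduction codegen), assuming the 32000x1x4096 suffix encodes M x N x K:

import numpy as np

def matvec_transpose_b(a, x):
    # a: (32000, 4096) f16 weights (binding 0 above), x: (4096,) f16 vector;
    # accumulate in f32 as the _f16xf16xf32 suffix indicates -> (32000,) f32.
    return a.astype(np.float32) @ x.astype(np.float32)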
// -----// IR Dump After CSE (cse) //----- //
func.func @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x16xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<1xf32>
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x4xf16>
%c0 = arith.constant 0 : index
%cst_2 = arith.constant 1.638400e+05 : f32
%cst_3 = arith.constant 0.000000e+00 : f32
%c1 = arith.constant 1 : index
%c40 = arith.constant 40 : index
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgrou
/home/prashant/iree/.venv/bin/iree-compile \
  --iree-hal-target-backends=rocm \
  --iree-hip-target=gfx942 \
  --iree-vm-bytecode-module-output-format=flatbuffer-binary \
  --iree-preprocessing-pass-pipeline="builtin.module(util.func(iree-global-opt-raise-special-ops, iree-flow-canonicalize), iree-preprocessing-transpose-convolution-pipeline, iree-preprocessing-pad-to-intrinsics, util.func(iree-preprocessing-generalize-linalg-matmul-experimental))" \
  --iree-hal-dump-executable-files-to=dump/ \
  --iree-dispatch-creation-enable-aggressive-fusion \
  --iree-dispatch-creation-enable-fuse-horizontal-contractions=false \
  --iree-opt-aggressively-propagate-transposes=true \
  --iree-codegen-llvmgpu-use-vector-distribution=true \
  --iree-opt-data-tiling=false \
  --iree-vm-target-truncate-unsupported-floats \
  --iree-opt-outer-dim-concat=true \
  --iree-codegen-gpu-native-math-precision=true \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-opt-strip-assertions \
  --iree-global-opt-propagate-tr
hal.executable public @main$async_dispatch_140 {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [214
import numpy as np
def attention(Q, K, V):
    """
    Computes attention: softmax(QK^T)V
    Args:
        Q: Query matrix of shape (batch_size, seq_len_q, d)
        K: Key matrix of shape (batch_size, seq_len_k, d)
        V: Value matrix of shape (batch_size, seq_len_k, d)
    Returns: output of shape (batch_size, seq_len_q, d).
    (The gist is truncated after the K line; the rest is a hedged completion.)
    """
    scores = Q @ K.transpose(0, 2, 1)               # (batch, seq_q, seq_k)
    scores -= scores.max(axis=-1, keepdims=True)    # numerically stable softmax
    weights = np.exp(scores)
    return (weights / weights.sum(axis=-1, keepdims=True)) @ V
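A quick shape check for the helper (random inputs, hypothetical sizes):

rng = np.random.default_rng(0)
Q = rng.standard_normal((2, 5, 8))
K = rng.standard_normal((2, 7, 8))
V = rng.standard_normal((2, 7, 8))
print(attention(Q, K, V).shape)  # (2, 5, 8)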
#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d1, d0)>
module {
func.func @matmul_add_transpose(%arg0: tensor<4x4xf32>, %arg1: tensor<4x4xf32>) -> tensor<4x4xf32> {
%0 = tensor.empty() : tensor<4x4xf32>
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%1 = tensor.empty() : tensor<4x4xf32>
%2 = scf.for %arg2 = %c0 to %c4 step %c2 iter_args(%arg3 = %1) -> (tensor<4x4xf32>) {