Skip to content

Instantly share code, notes, and snippets.

View makslevental's full-sized avatar
💩

Maksim Levental makslevental

💩
View GitHub Profile
; ModuleID = '/home/mlevental/dev_projects/mlir-python-extras/examples/llvm.bc'
source_filename = "LLVMDialectModule"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
target triple = "amdgcn-amd-amdhsa"
@__dynamic_shmem__0 = internal unnamed_addr addrspace(3) global [0 x i8] undef
; Function Attrs: nofree norecurse nounwind
define amdgpu_kernel void @kernel2_lds_shared0(ptr readonly captures(none) %0, ptr readonly captures(none) %1, ptr writeonly captures(none) %2) local_unnamed_addr #0 {
%.global3 = addrspacecast ptr %1 to ptr addrspace(1)
; __CLANG_OFFLOAD_BUNDLE____START__ hip-amdgcn-amd-amdhsa--gfx1150
; ModuleID = 'src/kernel2_lds.cpp'
source_filename = "src/kernel2_lds.cpp"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
target triple = "amdgcn-amd-amdhsa"
@_ZZ11kernel2_ldsPKfS0_PfiE2As = internal unnamed_addr addrspace(3) global [32 x [32 x float]] undef, align 16
@_ZZ11kernel2_ldsPKfS0_PfiE2Bs = internal unnamed_addr addrspace(3) global [32 x [32 x float]] undef, align 16
@__hip_cuid_db26c5b7fc0b9bd = addrspace(1) global i8 0
============================ ROCm System Management Interface ============================
============================== Version of System Component ===============================
Driver version: 6.10.5
==========================================================================================
=========================================== ID ===========================================
GPU[0] : Device Name: Strix [Radeon 880M / 890M]
GPU[0] : Device ID: 0x150e
GPU[0] : Device Rev: 0xc1
GPU[0] : Subsystem ID: 0x1df3
GPU[0] : GUID: 39438
@makslevental
makslevental / 0compute_time.py
Last active March 21, 2025 07:54
timing smt
import re
import numpy as np
build_times = open("log.bkup.txt").read()
link_smt_times = open("log.link.smt.txt").read()
link_no_smt_times = open("log.link.no.smt.txt").read()
real_reg = re.compile(r"real\s+(.*?)s")
%52 = arith.ceildivsi %arg5, %c64_i32_1 : i32
%53 = scf.forall (%arg12, %arg13) in (4, 1) shared_outs(%arg14 = %47) -> (tensor<64x64x!tt.ptr<f32>>) {
%72 = affine.apply #map(%arg12)
%73 = affine.apply #map1(%arg13)
%extracted_slice = tensor.extract_slice %36[%72, %73] [16, 64] [1, 1]
%extracted_slice_10 = tensor.extract_slice %arg14[%72, %73] [16, 64] [1, 1]
%74 = linalg.generic {
indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]
} ins(%extracted_slice : tensor<16x64x!tt.ptr<f32>>) outs(%extracted_slice_10 : tensor<16x64x!tt.ptr<f32>>) {
@makslevental
makslevental / triton.mlir
Created March 5, 2025 14:22
Triton + Linalg
BEFORE
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
module attributes {transform.target_tag = "payload"} {
tt.func public @matmul_kernel_2(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32) attributes {noinline = false} {
%c64_i32 = arith.constant 64 : i32
%c64_i32_0 = arith.constant 64 : i32
%c64_i32_1 = arith.constant 64 : i32
%c1_i32 = arith.constant 1 : i32
import numpy as np
from triton_mlir.extras.context import RAIIMLIRContextModule
from triton_mlir.dialects import tt as ttpp, scf, llvm, _tt_ops_gen as tt
from triton_mlir.ir import Attribute, ArrayAttr, TypeAttr, Type
from triton_mlir.extras.dialects.ext import arith
ctx = RAIIMLIRContextModule()
@ttpp.jit(arg_attrs=ArrayAttr.parse('[{tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, {tt.divisibility = 16 : i32}, {tt.divisibility = 16 : i32}, {tt.divisibility = 16 : i32}, {tt.divisibility = 16 : i32}, {tt.divisibility = 16 : i32}, {tt.divisibility = 16 : i32}]'), function_type=TypeAttr.parse('(!tt.ptr<f16>, !tt.ptr<f16>, !tt.ptr<f16>, i32, i32, i32, i32, i32, i32) -> ()'), noinline=False, sym_name='matmul_kernel', sym_visibility='public')
def matmul_kernel(arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8):
MLIR_CAPI_EXPORTED MlirAttribute mlirCallSiteLocAttrGet(Location callee, Location caller, MlirContext mlirContext);
MLIR_CAPI_EXPORTED Location mlirCallSiteLocGetcallee(MlirAttribute mlirCallSiteLoc);
MLIR_CAPI_EXPORTED Location mlirCallSiteLocGetcaller(MlirAttribute mlirCallSiteLoc);
MLIR_CAPI_EXPORTED MlirAttribute mlirFileLineColRangeAttrGet(StringAttr filename, unsigned start_line, unsigned start_column, unsigned end_line, unsigned end_column, MlirContext mlirContext);
MLIR_CAPI_EXPORTED StringAttr mlirFileLineColRangeGetfilename(MlirAttribute mlirFileLineColRange);
MLIR_CAPI_EXPORTED unsigned mlirFileLineColRangeGetstart_line(MlirAttribute mlirFileLineColRange);
MLIR_CAPI_EXPORTED unsigned mlirFileLineColRangeGetstart_column(MlirAttribute mlirFileLineColRange);
MLIR_CAPI_EXPORTED unsigned mlirFileLineColRangeGetend_line(MlirAttribute mlirFileLineColRange);
MLIR_CAPI_EXPORTED unsigned mlirFileLineColRangeGetend_column(MlirAttribute mlirFileLineColRange);
MLIR_CAPI_EXPORTED MlirAttribute mlirFusedLocAttrGe
diff --git a/CMakeLists.txt b/CMakeLists.txt
index de6ed2393..db776c0fc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,6 +47,8 @@ if (TRITON_PARALLEL_LINK_JOBS)
set(CMAKE_JOB_POOL_LINK link_job_pool)
endif()
+string(REPLACE "-Wl,-z,defs", "" CMAKE_MODULE_LINKER_FLAGS ${CMAKE_MODULE_LINKER_FLAGS})
+string(REPLACE "-Wl,-z,defs", "" CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS})
source_filename = "LLVMDialectModule"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
target triple = "amdgcn-amd-amdhsa"
@global_smem = external addrspace(3) global [0 x i8], align 16
; Function Attrs: alwaysinline nofree norecurse nounwind
define amdgpu_kernel void @matmul_kernel(ptr addrspace(1) inreg readonly captures(none) %0, ptr addrspace(1) inreg readonly captures(none) %1, ptr addrspace(1) inreg writeonly %2, i32 inreg %3, i32 inreg %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, ptr addrspace(1) inreg readnone captures(none) %9) local_unnamed_addr #0 !dbg !4 {
%11 = tail call i32 @llvm.amdgcn.workgroup.id.x(), !dbg !7
%12 = add i32 %4, 255, !dbg !8