Skip to content

Instantly share code, notes, and snippets.

View makslevental's full-sized avatar
💩

Maksim Levental makslevental

💩
View GitHub Profile
@makslevental
makslevental / triton.mlir
Created March 5, 2025 14:22
Triton + Linalg
BEFORE
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
module attributes {transform.target_tag = "payload"} {
tt.func public @matmul_kernel_2(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32) attributes {noinline = false} {
%c64_i32 = arith.constant 64 : i32
%c64_i32_0 = arith.constant 64 : i32
%c64_i32_1 = arith.constant 64 : i32
%c1_i32 = arith.constant 1 : i32
import numpy as np
from triton_mlir.extras.context import RAIIMLIRContextModule
from triton_mlir.dialects import tt as ttpp, scf, llvm, _tt_ops_gen as tt
from triton_mlir.ir import Attribute, ArrayAttr, TypeAttr, Type
from triton_mlir.extras.dialects.ext import arith
ctx = RAIIMLIRContextModule()
@ttpp.jit(arg_attrs=ArrayAttr.parse('[{tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, {tt.divisibility = 16 : i32}, {tt.divisibility = 16 : i32}, {tt.divisibility = 16 : i32}, {tt.divisibility = 16 : i32}, {tt.divisibility = 16 : i32}, {tt.divisibility = 16 : i32}]'), function_type=TypeAttr.parse('(!tt.ptr<f16>, !tt.ptr<f16>, !tt.ptr<f16>, i32, i32, i32, i32, i32, i32) -> ()'), noinline=False, sym_name='matmul_kernel', sym_visibility='public')
def matmul_kernel(arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8):
// CallSiteLoc: a constructor taking the `callee` and `caller` locations plus an
// MlirContext, followed by one getter per constructor field.
// NOTE(review): `Location` (and `StringAttr` below) are spelled without the
// `Mlir` prefix used by the other types in these signatures — confirm they are
// declared in a form C callers can use.
MLIR_CAPI_EXPORTED MlirAttribute mlirCallSiteLocAttrGet(Location callee, Location caller, MlirContext mlirContext);
MLIR_CAPI_EXPORTED Location mlirCallSiteLocGetcallee(MlirAttribute mlirCallSiteLoc);
MLIR_CAPI_EXPORTED Location mlirCallSiteLocGetcaller(MlirAttribute mlirCallSiteLoc);
// FileLineColRange: a constructor taking a filename and start/end line and
// column values plus an MlirContext, followed by one getter per constructor
// field. Each getter takes the attribute and returns that field's declared type.
MLIR_CAPI_EXPORTED MlirAttribute mlirFileLineColRangeAttrGet(StringAttr filename, unsigned start_line, unsigned start_column, unsigned end_line, unsigned end_column, MlirContext mlirContext);
MLIR_CAPI_EXPORTED StringAttr mlirFileLineColRangeGetfilename(MlirAttribute mlirFileLineColRange);
MLIR_CAPI_EXPORTED unsigned mlirFileLineColRangeGetstart_line(MlirAttribute mlirFileLineColRange);
MLIR_CAPI_EXPORTED unsigned mlirFileLineColRangeGetstart_column(MlirAttribute mlirFileLineColRange);
MLIR_CAPI_EXPORTED unsigned mlirFileLineColRangeGetend_line(MlirAttribute mlirFileLineColRange);
MLIR_CAPI_EXPORTED unsigned mlirFileLineColRangeGetend_column(MlirAttribute mlirFileLineColRange);
MLIR_CAPI_EXPORTED MlirAttribute mlirFusedLocAttrGe
diff --git a/CMakeLists.txt b/CMakeLists.txt
index de6ed2393..db776c0fc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,6 +47,8 @@ if (TRITON_PARALLEL_LINK_JOBS)
set(CMAKE_JOB_POOL_LINK link_job_pool)
endif()
+string(REPLACE "-Wl,-z,defs" "" CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS}") # strip -z,defs; quoted so an empty value is still passed as <input>
+string(REPLACE "-Wl,-z,defs" "" CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}") # same for shared-library link flags
source_filename = "LLVMDialectModule"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
target triple = "amdgcn-amd-amdhsa"
@global_smem = external addrspace(3) global [0 x i8], align 16
; Function Attrs: alwaysinline nofree norecurse nounwind
define amdgpu_kernel void @matmul_kernel(ptr addrspace(1) inreg readonly captures(none) %0, ptr addrspace(1) inreg readonly captures(none) %1, ptr addrspace(1) inreg writeonly %2, i32 inreg %3, i32 inreg %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, ptr addrspace(1) inreg readnone captures(none) %9) local_unnamed_addr #0 !dbg !4 {
%11 = tail call i32 @llvm.amdgcn.workgroup.id.x(), !dbg !7
%12 = add i32 %4, 255, !dbg !8
{
"artifacts" :
[
{
"path" : "bin/triton-opt"
}
],
"backtrace" : 2,
"backtraceGraph" :
{
{
"artifacts" :
[
{
"path" : "/home/mlevental/dev_projects/eudsl/projects/eudsl-llvmpy/src/llvm/eudslllvm_ext.cpython-310-x86_64-linux-gnu.so"
}
],
"backtrace" : 2,
"backtraceGraph" :
{
# -DCMAKE_INSTALL_PREFIX=/home/mlevental/dev_projects/iree/iree-install
#-DIREE_BUILD_PYTHON_BINDINGS=ON
#-DIREE_ENABLE_ASSERTIONS=ON
#-DCMAKE_C_COMPILER=clang-18
#-DCMAKE_CXX_COMPILER=clang++-18
#-DIREE_ENABLE_LLD=ON
#-DCMAKE_C_COMPILER_LAUNCHER=ccache
#-DCMAKE_CXX_COMPILER_LAUNCHER=ccache
#-DMLIR_ENABLE_BINDINGS_PYTHON=ON
#-DIREE_ENABLE_RUNTIME_TRACING=ON
/// Populates `odsState` for an ExtractSliceOp with one explicit result type:
/// stores `static_offsets` in the op properties, registers `source` as the
/// operand, and appends `result` as the result type. `odsBuilder` is unused.
void ExtractSliceOp::build(::mlir::OpBuilder &odsBuilder, ::mlir::OperationState &odsState, ::mlir::Type result, ::mlir::Value source, ::mlir::DenseI64ArrayAttr static_offsets) {
  auto &opProperties = odsState.getOrAddProperties<Properties>();
  opProperties.static_offsets = static_offsets;
  odsState.addOperands(source);
  odsState.addTypes(result);
}
void ExtractSliceOp::build(::mlir::OpBuilder &odsBuilder, ::mlir::OperationState &odsState, ::mlir::TypeRange resultTypes, ::mlir::Value source, ::mlir::DenseI64ArrayAttr static_offsets) {
odsState.addOperands(source);
odsState.getOrAddProperties<Properties>().static_offsets = static_offsets;
assert(resultTypes.size() == 1u && "mismatched number of results");
Index: third_party/amd/lib/TritonAMDGPUTransforms/CanonicalizePointers.cpp
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/CanonicalizePointers.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/CanonicalizePointers.cpp
--- a/third_party/amd/lib/TritonAMDGPUTransforms/CanonicalizePointers.cpp (revision f5e11cc519ea223de2c74c3c14a32f0beb327fae)
+++ b/third_party/amd/lib/TritonAMDGPUTransforms/CanonicalizePointers.cpp (date 1737041209117)
@@ -451,6 +451,19 @@
llvm::SetVector<Operation *> &opToRewrite;