April 1, 2025 18:29 · April 1, 2025 16:19 · March 26, 2025 01:12 · March 21, 2025 07:54 · March 7, 2025 15:46 · March 5, 2025 14:22
 ; ModuleID = '/home/mlevental/dev_projects/mlir-python-extras/examples/llvm.bc'
 source_filename = "LLVMDialectModule"
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
 target triple = "amdgcn-amd-amdhsa"

 @__dynamic_shmem__0 = internal unnamed_addr addrspace(3) global [0 x i8] undef

 ; Function Attrs: nofree norecurse nounwind
 define amdgpu_kernel void @kernel2_lds_shared0(ptr readonly captures(none) %0, ptr readonly captures(none) %1, ptr writeonly captures(none) %2) local_unnamed_addr #0 {
  %.global3 = addrspacecast ptr %1 to ptr addrspace(1)

 ; __CLANG_OFFLOAD_BUNDLE____START__ hip-amdgcn-amd-amdhsa--gfx1150
 ; ModuleID = 'src/kernel2_lds.cpp'
 source_filename = "src/kernel2_lds.cpp"
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
 target triple = "amdgcn-amd-amdhsa"

 @_ZZ11kernel2_ldsPKfS0_PfiE2As = internal unnamed_addr addrspace(3) global [32 x [32 x float]] undef, align 16
 @_ZZ11kernel2_ldsPKfS0_PfiE2Bs = internal unnamed_addr addrspace(3) global [32 x [32 x float]] undef, align 16
 @__hip_cuid_db26c5b7fc0b9bd = addrspace(1) global i8 0
 ============================ ROCm System Management Interface ============================
 ============================== Version of System Component ===============================
 Driver version: 6.10.5
 ==========================================================================================
 =========================================== ID ===========================================
 GPU[0]		: Device Name: 		Strix [Radeon 880M / 890M]
 GPU[0]		: Device ID: 		0x150e
 GPU[0]		: Device Rev: 		0xc1
 GPU[0]		: Subsystem ID: 	0x1df3
 GPU[0]		: GUID: 		39438
 import re

 import numpy as np

 build_times = open("log.bkup.txt").read()
 link_smt_times = open("log.link.smt.txt").read()
 link_no_smt_times = open("log.link.no.smt.txt").read()


 real_reg = re.compile(r"real\s+(.*?)s")

      %52 = arith.ceildivsi %arg5, %c64_i32_1 : i32
      %53 = scf.forall (%arg12, %arg13) in (4, 1) shared_outs(%arg14 = %47) -> (tensor<64x64x!tt.ptr<f32>>) {
        %72 = affine.apply #map(%arg12)
        %73 = affine.apply #map1(%arg13)
        %extracted_slice = tensor.extract_slice %36[%72, %73] [16, 64] [1, 1] 
        %extracted_slice_10 = tensor.extract_slice %arg14[%72, %73] [16, 64] [1, 1] 
      %74 = linalg.generic {
        indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]
      } ins(%extracted_slice : tensor<16x64x!tt.ptr<f32>>) outs(%extracted_slice_10 : tensor<16x64x!tt.ptr<f32>>) {
 BEFORE

 #map = affine_map<(d0, d1) -> (d0, d1)>
 module {
  module attributes {transform.target_tag = "payload"} {
    tt.func public @matmul_kernel_2(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32) attributes {noinline = false} {
      %c64_i32 = arith.constant 64 : i32
      %c64_i32_0 = arith.constant 64 : i32
      %c64_i32_1 = arith.constant 64 : i32
      %c1_i32 = arith.constant 1 : i32
 import numpy as np
 from triton_mlir.extras.context import RAIIMLIRContextModule
 from triton_mlir.dialects import tt as ttpp, scf, llvm, _tt_ops_gen as tt
 from triton_mlir.ir import Attribute, ArrayAttr, TypeAttr, Type
 from triton_mlir.extras.dialects.ext import arith

 ctx = RAIIMLIRContextModule()

 @ttpp.jit(arg_attrs=ArrayAttr.parse('[{tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, {tt.divisibility = 16 : i32}, {tt.divisibility = 16 : i32}, {tt.divisibility = 16 : i32}, {tt.divisibility = 16 : i32}, {tt.divisibility = 16 : i32}, {tt.divisibility = 16 : i32}]'), function_type=TypeAttr.parse('(!tt.ptr<f16>, !tt.ptr<f16>, !tt.ptr<f16>, i32, i32, i32, i32, i32, i32) -> ()'), noinline=False, sym_name='matmul_kernel', sym_visibility='public')
 def matmul_kernel(arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8):
 MLIR_CAPI_EXPORTED MlirAttribute mlirCallSiteLocAttrGet(Location callee, Location caller, MlirContext mlirContext);
 MLIR_CAPI_EXPORTED Location mlirCallSiteLocGetcallee(MlirAttribute mlirCallSiteLoc);
 MLIR_CAPI_EXPORTED Location mlirCallSiteLocGetcaller(MlirAttribute mlirCallSiteLoc);
 MLIR_CAPI_EXPORTED MlirAttribute mlirFileLineColRangeAttrGet(StringAttr filename, unsigned start_line, unsigned start_column, unsigned end_line, unsigned end_column, MlirContext mlirContext);
 MLIR_CAPI_EXPORTED StringAttr mlirFileLineColRangeGetfilename(MlirAttribute mlirFileLineColRange);
 MLIR_CAPI_EXPORTED unsigned mlirFileLineColRangeGetstart_line(MlirAttribute mlirFileLineColRange);
 MLIR_CAPI_EXPORTED unsigned mlirFileLineColRangeGetstart_column(MlirAttribute mlirFileLineColRange);
 MLIR_CAPI_EXPORTED unsigned mlirFileLineColRangeGetend_line(MlirAttribute mlirFileLineColRange);
 MLIR_CAPI_EXPORTED unsigned mlirFileLineColRangeGetend_column(MlirAttribute mlirFileLineColRange);
 MLIR_CAPI_EXPORTED MlirAttribute mlirFusedLocAttrGe
 diff --git a/CMakeLists.txt b/CMakeLists.txt
 index de6ed2393..db776c0fc 100644
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
 @@ -47,6 +47,8 @@ if (TRITON_PARALLEL_LINK_JOBS)
     set(CMAKE_JOB_POOL_LINK link_job_pool)
 endif()
 
 +string(REPLACE "-Wl,-z,defs", "" CMAKE_MODULE_LINKER_FLAGS ${CMAKE_MODULE_LINKER_FLAGS})
 +string(REPLACE "-Wl,-z,defs", "" CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS})
 source_filename = "LLVMDialectModule"
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
 target triple = "amdgcn-amd-amdhsa"

 @global_smem = external addrspace(3) global [0 x i8], align 16

 ; Function Attrs: alwaysinline nofree norecurse nounwind
 define amdgpu_kernel void @matmul_kernel(ptr addrspace(1) inreg readonly captures(none) %0, ptr addrspace(1) inreg readonly captures(none) %1, ptr addrspace(1) inreg writeonly %2, i32 inreg %3, i32 inreg %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, ptr addrspace(1) inreg readnone captures(none) %9) local_unnamed_addr #0 !dbg !4 {
  %11 = tail call i32 @llvm.amdgcn.workgroup.id.x(), !dbg !7
  %12 = add i32 %4, 255, !dbg !8
	; ModuleID = '/home/mlevental/dev_projects/mlir-python-extras/examples/llvm.bc'
	source_filename = "LLVMDialectModule"
	target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
	target triple = "amdgcn-amd-amdhsa"

	@__dynamic_shmem__0 = internal unnamed_addr addrspace(3) global [0 x i8] undef

	; Function Attrs: nofree norecurse nounwind
	define amdgpu_kernel void @kernel2_lds_shared0(ptr readonly captures(none) %0, ptr readonly captures(none) %1, ptr writeonly captures(none) %2) local_unnamed_addr #0 {
	%.global3 = addrspacecast ptr %1 to ptr addrspace(1)

	; __CLANG_OFFLOAD_BUNDLE____START__ hip-amdgcn-amd-amdhsa--gfx1150
	; ModuleID = 'src/kernel2_lds.cpp'
	source_filename = "src/kernel2_lds.cpp"
	target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
	target triple = "amdgcn-amd-amdhsa"

	@_ZZ11kernel2_ldsPKfS0_PfiE2As = internal unnamed_addr addrspace(3) global [32 x [32 x float]] undef, align 16
	@_ZZ11kernel2_ldsPKfS0_PfiE2Bs = internal unnamed_addr addrspace(3) global [32 x [32 x float]] undef, align 16
	@__hip_cuid_db26c5b7fc0b9bd = addrspace(1) global i8 0
	============================ ROCm System Management Interface ============================
	============================== Version of System Component ===============================
	Driver version: 6.10.5
	==========================================================================================
	=========================================== ID ===========================================
	GPU[0] : Device Name: Strix [Radeon 880M / 890M]
	GPU[0] : Device ID: 0x150e
	GPU[0] : Device Rev: 0xc1
	GPU[0] : Subsystem ID: 0x1df3
	GPU[0] : GUID: 39438
	import re

	import numpy as np

	build_times = open("log.bkup.txt").read()
	link_smt_times = open("log.link.smt.txt").read()
	link_no_smt_times = open("log.link.no.smt.txt").read()


	real_reg = re.compile(r"real\s+(.*?)s")

	%52 = arith.ceildivsi %arg5, %c64_i32_1 : i32
	%53 = scf.forall (%arg12, %arg13) in (4, 1) shared_outs(%arg14 = %47) -> (tensor<64x64x!tt.ptr<f32>>) {
	%72 = affine.apply #map(%arg12)
	%73 = affine.apply #map1(%arg13)
	%extracted_slice = tensor.extract_slice %36[%72, %73] [16, 64] [1, 1]
	%extracted_slice_10 = tensor.extract_slice %arg14[%72, %73] [16, 64] [1, 1]
	%74 = linalg.generic {
	indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]
	} ins(%extracted_slice : tensor<16x64x!tt.ptr<f32>>) outs(%extracted_slice_10 : tensor<16x64x!tt.ptr<f32>>) {
	BEFORE

	#map = affine_map<(d0, d1) -> (d0, d1)>
	module {
	module attributes {transform.target_tag = "payload"} {
	tt.func public @matmul_kernel_2(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32) attributes {noinline = false} {
	%c64_i32 = arith.constant 64 : i32
	%c64_i32_0 = arith.constant 64 : i32
	%c64_i32_1 = arith.constant 64 : i32
	%c1_i32 = arith.constant 1 : i32
	import numpy as np
	from triton_mlir.extras.context import RAIIMLIRContextModule
	from triton_mlir.dialects import tt as ttpp, scf, llvm, _tt_ops_gen as tt
	from triton_mlir.ir import Attribute, ArrayAttr, TypeAttr, Type
	from triton_mlir.extras.dialects.ext import arith

	ctx = RAIIMLIRContextModule()

	@ttpp.jit(arg_attrs=ArrayAttr.parse('[{tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, {tt.divisibility = 16 : i32}, {tt.divisibility = 16 : i32}, {tt.divisibility = 16 : i32}, {tt.divisibility = 16 : i32}, {tt.divisibility = 16 : i32}, {tt.divisibility = 16 : i32}]'), function_type=TypeAttr.parse('(!tt.ptr<f16>, !tt.ptr<f16>, !tt.ptr<f16>, i32, i32, i32, i32, i32, i32) -> ()'), noinline=False, sym_name='matmul_kernel', sym_visibility='public')
	def matmul_kernel(arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8):
	MLIR_CAPI_EXPORTED MlirAttribute mlirCallSiteLocAttrGet(Location callee, Location caller, MlirContext mlirContext);
	MLIR_CAPI_EXPORTED Location mlirCallSiteLocGetcallee(MlirAttribute mlirCallSiteLoc);
	MLIR_CAPI_EXPORTED Location mlirCallSiteLocGetcaller(MlirAttribute mlirCallSiteLoc);
	MLIR_CAPI_EXPORTED MlirAttribute mlirFileLineColRangeAttrGet(StringAttr filename, unsigned start_line, unsigned start_column, unsigned end_line, unsigned end_column, MlirContext mlirContext);
	MLIR_CAPI_EXPORTED StringAttr mlirFileLineColRangeGetfilename(MlirAttribute mlirFileLineColRange);
	MLIR_CAPI_EXPORTED unsigned mlirFileLineColRangeGetstart_line(MlirAttribute mlirFileLineColRange);
	MLIR_CAPI_EXPORTED unsigned mlirFileLineColRangeGetstart_column(MlirAttribute mlirFileLineColRange);
	MLIR_CAPI_EXPORTED unsigned mlirFileLineColRangeGetend_line(MlirAttribute mlirFileLineColRange);
	MLIR_CAPI_EXPORTED unsigned mlirFileLineColRangeGetend_column(MlirAttribute mlirFileLineColRange);
	MLIR_CAPI_EXPORTED MlirAttribute mlirFusedLocAttrGe
	diff --git a/CMakeLists.txt b/CMakeLists.txt
	index de6ed2393..db776c0fc 100644
	--- a/CMakeLists.txt
	+++ b/CMakeLists.txt
	@@ -47,6 +47,8 @@ if (TRITON_PARALLEL_LINK_JOBS)
	set(CMAKE_JOB_POOL_LINK link_job_pool)
	endif()

	+string(REPLACE "-Wl,-z,defs", "" CMAKE_MODULE_LINKER_FLAGS ${CMAKE_MODULE_LINKER_FLAGS})
	+string(REPLACE "-Wl,-z,defs", "" CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS})