Skip to content

Instantly share code, notes, and snippets.

@Artem-B
Created July 21, 2025 19:49
Show Gist options
  • Save Artem-B/0fd96044926baa90ea83180f9a00c1b4 to your computer and use it in GitHub Desktop.
Save Artem-B/0fd96044926baa90ea83180f9a00c1b4 to your computer and use it in GitHub Desktop.
; Module header: identification, target data layout, and the single global
; that @kernel uses as scratch storage.
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
; Data layout string: 'e' = little-endian; p6:32:32 = 32-bit pointers for
; address space 6; the remaining entries give integer/vector alignments and
; the native integer widths (16/32/64).
target datalayout = "e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
; Externally defined, zero-length i8 array in address space 3, 16-byte
; aligned. Its real size is not visible in this module.
; NOTE(review): presumably dynamically sized shared memory supplied at
; launch -- confirm against the producer.
@global_smem = external addrspace(3) global [0 x i8], align 16
; @kernel: computes the signed minimum of the bytes read through %0 and
; has thread 0 store that single byte through %1.
;   %0 - source pointer (addrspace 1); each thread reads 4 bytes at a
;        thread-derived offset in [0, 508].
;   %1 - destination pointer (addrspace 1); one i8 is stored by thread 0.
;   %2 - unused in the body (marked readnone).
; Stages: per-thread 4-byte min -> warp-wide redux min -> one i32 slot per
; warp in @global_smem -> shuffle-based min over 4 slots -> final store.
define ptx_kernel void @kernel(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) readnone captures(none) %2) local_unnamed_addr #0 !dbg !6 {
; %4 = thread index in x; %5 = low 5 bits (lane id), %6 = tid >> 5 (warp id).
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
%5 = and i32 %4, 31, !dbg !9
%6 = lshr i32 %4, 5, !dbg !9
; Byte offset into %0: (tid * 4) masked with 508, i.e. (tid & 127) * 4.
%7 = shl nuw nsw i32 %4, 2, !dbg !9
%8 = and i32 %7, 508, !dbg !9
%9 = zext nneg i32 %8 to i64, !dbg !10
%10 = getelementptr i8, ptr addrspace(1) %0, i64 %9, !dbg !10
; Inline PTX: zero the destination register, then load 32 bits from global
; memory at %10.
%11 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %10) #5, !dbg !11
; Reinterpret the loaded word as four i8 lanes and take their signed min,
; then sign-extend to i32 for the cross-thread steps.
%12 = bitcast i32 %11 to <4 x i8>, !dbg !11
%13 = tail call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> %12), !dbg !12
%14 = sext i8 %13 to i32, !dbg !12
; Warp-level min reduction; second operand -1 is the member mask (all lanes).
%15 = tail call i32 @llvm.nvvm.redux.sync.min(i32 %14, i32 -1), !dbg !16
; Slot index = warp id & 3; only lane 0 of each warp (%17) performs the store.
%16 = and i32 %6, 3, !dbg !16
%17 = icmp eq i32 %5, 0, !dbg !16
%18 = zext nneg i32 %16 to i64, !dbg !16
%19 = getelementptr i32, ptr addrspace(3) @global_smem, i64 %18, !dbg !16
%20 = insertelement <1 x i32> poison, i32 %15, i64 0, !dbg !16
; Predicated inline PTX store of the warp result into @global_smem[slot].
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %19, <1 x i32> %20, i1 %17) #5, !dbg !16
; Block-wide barrier (id 0) so all per-warp slots are written before reading.
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !16
; Threads with tid < 4 read back slot [tid] (i32 elements).
%21 = icmp samesign ult i32 %4, 4, !dbg !16
%22 = zext nneg i32 %4 to i64, !dbg !16
%23 = getelementptr i32, ptr addrspace(3) @global_smem, i64 %22, !dbg !16
; NOTE(review): the asm writes $0 only when the predicate is true, so %24
; is not defined by this statement for threads with tid >= 4. The value
; consumed by thread 0 below depends only on lanes 0..3 (XOR 2, then XOR 1).
%24 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %23, i1 %21) #5, !dbg !16
; Two butterfly-shuffle + smin steps (lane XOR 2, then lane XOR 1) combine
; the four per-warp values.
%25 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %24, i32 2, i32 31), !dbg !16
%26 = tail call i32 @llvm.smin.i32(i32 %24, i32 %25), !dbg !12
%27 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %26, i32 1, i32 31), !dbg !16
%28 = tail call i32 @llvm.smin.i32(i32 %26, i32 %27), !dbg !12
; Thread 0 only: write the combined result back to its slot (%23 is slot 0
; when tid == 0).
%29 = icmp eq i32 %4, 0, !dbg !16
%30 = insertelement <1 x i32> poison, i32 %28, i64 0, !dbg !16
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %23, <1 x i32> %30, i1 %29) #5, !dbg !16
; Second barrier, then every thread reloads slot 0.
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !16
%31 = load i32, ptr addrspace(3) @global_smem, align 16, !dbg !16
; Narrow back to i8; the global store is predicated on tid == 0 (%29).
%32 = trunc i32 %31 to i8, !dbg !17
tail call void asm sideeffect "@$2 st.global.b8 [ $1 + 0 ], { $0 };", "c,l,b"(i8 %32, ptr addrspace(1) %1, i1 %29) #5, !dbg !17
ret void, !dbg !18
}
; Declarations of the intrinsics called from @kernel. The trailing #N on each
; line selects one of the attribute groups defined later in the module.
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
; Thread index (x); the return value is declared to lie in [0, 1024).
declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
; Signed minimum of two i32 values.
declare i32 @llvm.smin.i32(i32, i32) #1
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
; Warp-level min reduction; called as (value, member mask) in @kernel.
declare i32 @llvm.nvvm.redux.sync.min(i32, i32) #2
; Function Attrs: convergent nocallback nounwind
; Barrier intrinsic; @kernel passes barrier id 0.
declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
; Butterfly (lane-XOR) shuffle; @kernel passes (mask, value, xor-offset, 31).
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
; Signed-min reduction across the four lanes of a <4 x i8> vector.
declare i8 @llvm.vector.reduce.smin.v4i8(<4 x i8>) #4
; Attribute groups referenced by #N in the definitions and declarations.
; #0 is attached to @kernel.
; NOTE(review): "nvvm.reqntid"="128" presumably fixes the launch at 128
; threads per block, matching the 4 shared-memory slots used by the kernel --
; confirm against the NVVM documentation.
attributes #0 = { "nvvm.reqntid"="128" }
; #1: side-effect-free, speculatable intrinsics (tid.x, smin.i32).
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; #2: convergent cross-lane intrinsics (redux.sync.min, shfl.sync.bfly).
attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
; #3: convergent barrier intrinsic.
attributes #3 = { convergent nocallback nounwind }
; #4: vector.reduce.smin.v4i8.
attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; #5: applied to every inline-asm call site in @kernel.
attributes #5 = { nounwind }
; Named module metadata: debug compile unit, module flags, producer ident,
; and NVVM IR version.
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!2, !3}
!llvm.ident = !{!4}
!nvvmir.version = !{!5}
; Compile unit: producer "triton", line-tables-only debug info.
!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
; Source file that defines the kernel (test_core.py).
!1 = !DIFile(filename: "test_core.py", directory: "/usr/local/google/_blaze_tra/055757613a7bee1fd9750a4a19b01463/execroot/google3/blaze-out/k8-fastbuild-cuda/bin/third_party/triton/python/test/unit/language/test_core_h100.runfiles/google3/third_party/triton/python/test/unit/language")
; Module flags: debug-info version 3 and nvvm-reflect-ftz = 1.
!2 = !{i32 2, !"Debug Info Version", i32 3}
!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
; NVVM IR version 2.0.
!5 = !{i32 2, i32 0}
; Subprogram for @kernel, declared at test_core.py line 2421.
!6 = distinct !DISubprogram(name: "kernel", linkageName: "kernel", scope: !1, file: !1, line: 2421, type: !7, scopeLine: 2421, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
!7 = !DISubroutineType(cc: DW_CC_normal, types: !8)
!8 = !{}
; Locations on test_core.py line 2422 (address computation and load).
!9 = !DILocation(line: 2422, column: 33, scope: !6)
!10 = !DILocation(line: 2422, column: 20, scope: !6)
!11 = !DILocation(line: 2422, column: 16, scope: !6)
; Locations in standard.py (lines 226 and 247), inlined at test_core.py:2423;
; these tag the min/reduction instructions in @kernel.
!12 = !DILocation(line: 226, column: 27, scope: !13, inlinedAt: !15)
!13 = distinct !DILexicalBlockFile(scope: !6, file: !14, discriminator: 0)
!14 = !DIFile(filename: "standard.py", directory: "/usr/local/google/_blaze_tra/055757613a7bee1fd9750a4a19b01463/execroot/google3/blaze-out/k8-fastbuild-cuda/bin/third_party/triton/python/test/unit/language/test_core_h100.runfiles/google3/third_party/py/triton/language")
!15 = !DILocation(line: 2423, column: 15, scope: !6)
!16 = !DILocation(line: 247, column: 40, scope: !13, inlinedAt: !15)
; Locations on test_core.py line 2424 (final store and return).
!17 = !DILocation(line: 2424, column: 16, scope: !6)
!18 = !DILocation(line: 2424, column: 4, scope: !6)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment