Created
July 10, 2024 15:32
-
-
Save embg/9fcbf39f14e1712bc6d500ff71ffa67d to your computer and use it in GitHub Desktop.
ptxas segfault repro
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// Generated by LLVM NVPTX Back-End | |
// | |
.version 8.4 | |
.target sm_90a | |
.address_size 64 | |
// .globl _copy_2d_tma_kernel | |
.extern .shared .align 16 .b8 global_smem[]; | |
.visible .entry _copy_2d_tma_kernel( | |
.param .u64 _copy_2d_tma_kernel_param_0, | |
.param .u64 _copy_2d_tma_kernel_param_1, | |
.param .u64 _copy_2d_tma_kernel_param_2, | |
.param .u64 _copy_2d_tma_kernel_param_3, | |
.param .u32 _copy_2d_tma_kernel_param_4, | |
.param .u32 _copy_2d_tma_kernel_param_5 | |
) | |
.maxntid 128, 1, 1 | |
{ | |
.reg .pred %p<8>; | |
.reg .b32 %r<82>; | |
.reg .f32 %f<33>; | |
.reg .b64 %rd<3>; | |
.loc 1 5 0 | |
$L__func_begin0: | |
.loc 1 5 0 | |
ld.param.u64 %rd2, [_copy_2d_tma_kernel_param_1]; | |
$L__tmp0: | |
.loc 1 10 4 | |
// begin inline asm | |
mov.u32 %r1, %ctaid.x; | |
// end inline asm | |
ld.param.u32 %r14, [_copy_2d_tma_kernel_param_5]; | |
.loc 1 11 4 | |
add.s32 %r15, %r14, 63; | |
.loc 1 12 4 | |
shr.s32 %r16, %r15, 31; | |
shr.u32 %r17, %r16, 26; | |
add.s32 %r18, %r15, %r17; | |
shr.s32 %r19, %r18, 6; | |
ld.param.u64 %rd1, [_copy_2d_tma_kernel_param_3]; | |
.loc 1 13 4 | |
div.s32 %r21, %r1, %r19; | |
mul.lo.s32 %r22, %r21, %r19; | |
sub.s32 %r23, %r1, %r22; | |
.loc 1 15 4 | |
shl.b32 %r6, %r21, 6; | |
.loc 1 16 4 | |
shl.b32 %r24, %r23, 6; | |
.loc 1 19 4 | |
mov.u32 %r25, %tid.x; | |
setp.eq.s32 %p1, %r25, 0; | |
mov.u32 %r26, global_smem; | |
add.s32 %r2, %r26, 16384; | |
// begin inline asm | |
@%p1 mbarrier.init.shared::cta.b64 [%r2], 1; | |
// end inline asm | |
.loc 1 20 4 | |
bar.sync 0; | |
// begin inline asm | |
@%p1 mbarrier.arrive.expect_tx.shared.b64 _, [%r2], 16384; | |
// end inline asm | |
.loc 1 21 4 | |
bar.sync 0; | |
shr.u32 %r27, %r25, 5; | |
shfl.sync.idx.b32 %r28, %r27, 0, 31, -1; | |
setp.lt.u32 %p7, %r25, 64; | |
// begin inline asm | |
@%p7 elect.sync _|%p3, 0xffffffff; | |
@!%p7 setp.lt.u32 %p3, %r25, 1024; | |
// end inline asm | |
and.pred %p4, %p7, %p3; | |
shl.b32 %r29, %r28, 13; | |
add.s32 %r4, %r26, %r29; | |
shl.b32 %r30, %r28, 5; | |
add.s32 %r5, %r30, %r24; | |
// begin inline asm | |
@%p4 cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [%r4], [%rd1, {%r5, %r6}], [%r2]; | |
// end inline asm | |
.loc 1 22 4 | |
bar.sync 0; | |
mov.b32 %r9, 0; | |
// begin inline asm | |
{ | |
.reg .pred P1; | |
waitLoop: | |
mbarrier.try_wait.parity.shared.b64 P1, [%r2], %r9; | |
@!P1 bra.uni waitLoop; | |
} | |
// end inline asm | |
.loc 1 23 4 | |
bar.sync 0; | |
// begin inline asm | |
@%p1 mbarrier.inval.shared::cta.b64 [%r2]; | |
// end inline asm | |
.loc 1 24 4 | |
and.b32 %r31, %r25, 8; | |
and.b32 %r32, %r25, 16; | |
and.b32 %r33, %r25, 31; | |
bfe.s32 %r34, %r25, 6, 1; | |
and.b32 %r35, %r34, 36; | |
xor.b32 %r36, %r35, %r33; | |
shl.b32 %r37, %r25, 6; | |
and.b32 %r38, %r37, 2048; | |
or.b32 %r39, %r36, %r38; | |
shl.b32 %r40, %r39, 2; | |
add.s32 %r41, %r26, %r40; | |
ld.shared.f32 %f1, [%r41]; | |
and.b32 %r42, %r25, 7; | |
or.b32 %r43, %r42, 8; | |
xor.b32 %r44, %r43, %r31; | |
or.b32 %r45, %r44, %r32; | |
xor.b32 %r46, %r45, %r35; | |
or.b32 %r47, %r46, %r38; | |
shl.b32 %r48, %r47, 2; | |
or.b32 %r49, %r48, 256; | |
add.s32 %r50, %r26, %r49; | |
ld.shared.f32 %f2, [%r50]; | |
and.b32 %r51, %r25, 15; | |
or.b32 %r52, %r51, 16; | |
or.b32 %r53, %r35, %r32; | |
xor.b32 %r54, %r53, %r52; | |
or.b32 %r55, %r54, %r38; | |
shl.b32 %r56, %r55, 2; | |
add.s32 %r57, %r26, %r56; | |
ld.shared.f32 %f3, [%r57+512]; | |
or.b32 %r58, %r42, 24; | |
and.b32 %r59, %r25, 24; | |
xor.b32 %r60, %r58, %r59; | |
xor.b32 %r61, %r60, %r35; | |
or.b32 %r62, %r61, %r38; | |
shl.b32 %r63, %r62, 2; | |
add.s32 %r64, %r26, %r63; | |
ld.shared.f32 %f4, [%r64+768]; | |
ld.shared.f32 %f5, [%r41+1024]; | |
or.b32 %r65, %r48, 1280; | |
add.s32 %r66, %r26, %r65; | |
ld.shared.f32 %f6, [%r66]; | |
ld.shared.f32 %f7, [%r57+1536]; | |
ld.shared.f32 %f8, [%r64+1792]; | |
ld.shared.f32 %f9, [%r41+2048]; | |
or.b32 %r67, %r48, 2304; | |
add.s32 %r68, %r26, %r67; | |
ld.shared.f32 %f10, [%r68]; | |
ld.shared.f32 %f11, [%r57+2560]; | |
ld.shared.f32 %f12, [%r64+2816]; | |
ld.shared.f32 %f13, [%r41+3072]; | |
or.b32 %r69, %r48, 3328; | |
add.s32 %r70, %r26, %r69; | |
ld.shared.f32 %f14, [%r70]; | |
ld.shared.f32 %f15, [%r57+3584]; | |
ld.shared.f32 %f16, [%r64+3840]; | |
ld.shared.f32 %f17, [%r41+4096]; | |
or.b32 %r71, %r48, 4352; | |
add.s32 %r72, %r26, %r71; | |
ld.shared.f32 %f18, [%r72]; | |
ld.shared.f32 %f19, [%r57+4608]; | |
ld.shared.f32 %f20, [%r64+4864]; | |
ld.shared.f32 %f21, [%r41+5120]; | |
or.b32 %r73, %r48, 5376; | |
add.s32 %r74, %r26, %r73; | |
ld.shared.f32 %f22, [%r74]; | |
ld.shared.f32 %f23, [%r57+5632]; | |
ld.shared.f32 %f24, [%r64+5888]; | |
ld.shared.f32 %f25, [%r41+6144]; | |
or.b32 %r75, %r48, 6400; | |
add.s32 %r76, %r26, %r75; | |
ld.shared.f32 %f26, [%r76]; | |
ld.shared.f32 %f27, [%r57+6656]; | |
ld.shared.f32 %f28, [%r64+6912]; | |
ld.shared.f32 %f29, [%r41+7168]; | |
or.b32 %r77, %r48, 7424; | |
add.s32 %r78, %r26, %r77; | |
ld.shared.f32 %f30, [%r78]; | |
ld.shared.f32 %f31, [%r57+7680]; | |
ld.shared.f32 %f32, [%r64+7936]; | |
.loc 1 25 4 | |
bar.sync 0; | |
st.shared.f32 [%r41], %f1; | |
st.shared.f32 [%r50], %f2; | |
st.shared.f32 [%r57+512], %f3; | |
st.shared.f32 [%r64+768], %f4; | |
st.shared.f32 [%r41+1024], %f5; | |
st.shared.f32 [%r66], %f6; | |
st.shared.f32 [%r57+1536], %f7; | |
st.shared.f32 [%r64+1792], %f8; | |
st.shared.f32 [%r41+2048], %f9; | |
st.shared.f32 [%r68], %f10; | |
st.shared.f32 [%r57+2560], %f11; | |
st.shared.f32 [%r64+2816], %f12; | |
st.shared.f32 [%r41+3072], %f13; | |
st.shared.f32 [%r70], %f14; | |
st.shared.f32 [%r57+3584], %f15; | |
st.shared.f32 [%r64+3840], %f16; | |
st.shared.f32 [%r41+4096], %f17; | |
st.shared.f32 [%r72], %f18; | |
st.shared.f32 [%r57+4608], %f19; | |
st.shared.f32 [%r64+4864], %f20; | |
st.shared.f32 [%r41+5120], %f21; | |
st.shared.f32 [%r74], %f22; | |
st.shared.f32 [%r57+5632], %f23; | |
st.shared.f32 [%r64+5888], %f24; | |
st.shared.f32 [%r41+6144], %f25; | |
st.shared.f32 [%r76], %f26; | |
st.shared.f32 [%r57+6656], %f27; | |
st.shared.f32 [%r64+6912], %f28; | |
st.shared.f32 [%r41+7168], %f29; | |
st.shared.f32 [%r78], %f30; | |
st.shared.f32 [%r57+7680], %f31; | |
st.shared.f32 [%r64+7936], %f32; | |
.loc 1 26 4 | |
// begin inline asm | |
fence.proxy.async.shared::cta; | |
// end inline asm | |
.loc 1 27 4 | |
bar.sync 0; | |
shfl.sync.idx.b32 %r79, %r27, 0, 31, -1; | |
shl.b32 %r80, %r79, 13; | |
add.s32 %r13, %r26, %r80; | |
shl.b32 %r81, %r79, 5; | |
add.s32 %r11, %r81, %r24; | |
// begin inline asm | |
@%p4 cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [%rd2, {%r11, %r6}], [%r13]; | |
// end inline asm | |
// begin inline asm | |
cp.async.bulk.commit_group ; | |
// end inline asm | |
.loc 1 28 4 | |
// begin inline asm | |
cp.async.bulk.wait_group.read 0x0; | |
// end inline asm | |
.loc 1 29 4 | |
ret; | |
$L__tmp1: | |
$L__func_end0: | |
} | |
.file 1 "/home/embg/.triton/cache/897ddbc8f4d794d3272ef396aec2ea01c5e3bfb6d23c2c9cc9d5997e277dc715/_copy_2d_tma_kernel.ttgir" | |
.section .debug_abbrev | |
{ | |
.b8 1 | |
.b8 17 | |
.b8 0 | |
.b8 37 | |
.b8 8 | |
.b8 19 | |
.b8 5 | |
.b8 3 | |
.b8 8 | |
.b8 16 | |
.b8 6 | |
.b8 27 | |
.b8 8 | |
.b8 17 | |
.b8 1 | |
.b8 18 | |
.b8 1 | |
.b8 0 | |
.b8 0 | |
.b8 0 | |
} | |
.section .debug_info | |
{ | |
.b32 153 | |
.b8 2 | |
.b8 0 | |
.b32 .debug_abbrev | |
.b8 8 | |
.b8 1 | |
.b8 116 | |
.b8 114 | |
.b8 105 | |
.b8 116 | |
.b8 111 | |
.b8 110 | |
.b8 0 | |
.b8 2 | |
.b8 0 | |
.b8 95 | |
.b8 99 | |
.b8 111 | |
.b8 112 | |
.b8 121 | |
.b8 95 | |
.b8 50 | |
.b8 100 | |
.b8 95 | |
.b8 116 | |
.b8 109 | |
.b8 97 | |
.b8 95 | |
.b8 107 | |
.b8 101 | |
.b8 114 | |
.b8 110 | |
.b8 101 | |
.b8 108 | |
.b8 46 | |
.b8 116 | |
.b8 116 | |
.b8 103 | |
.b8 105 | |
.b8 114 | |
.b8 0 | |
.b32 .debug_line | |
.b8 47 | |
.b8 104 | |
.b8 111 | |
.b8 109 | |
.b8 101 | |
.b8 47 | |
.b8 101 | |
.b8 109 | |
.b8 98 | |
.b8 103 | |
.b8 47 | |
.b8 46 | |
.b8 116 | |
.b8 114 | |
.b8 105 | |
.b8 116 | |
.b8 111 | |
.b8 110 | |
.b8 47 | |
.b8 99 | |
.b8 97 | |
.b8 99 | |
.b8 104 | |
.b8 101 | |
.b8 47 | |
.b8 56 | |
.b8 57 | |
.b8 55 | |
.b8 100 | |
.b8 100 | |
.b8 98 | |
.b8 99 | |
.b8 56 | |
.b8 102 | |
.b8 52 | |
.b8 100 | |
.b8 55 | |
.b8 57 | |
.b8 52 | |
.b8 100 | |
.b8 51 | |
.b8 50 | |
.b8 55 | |
.b8 50 | |
.b8 101 | |
.b8 102 | |
.b8 51 | |
.b8 57 | |
.b8 54 | |
.b8 97 | |
.b8 101 | |
.b8 99 | |
.b8 50 | |
.b8 101 | |
.b8 97 | |
.b8 48 | |
.b8 49 | |
.b8 99 | |
.b8 53 | |
.b8 101 | |
.b8 51 | |
.b8 98 | |
.b8 102 | |
.b8 98 | |
.b8 54 | |
.b8 100 | |
.b8 50 | |
.b8 51 | |
.b8 99 | |
.b8 50 | |
.b8 99 | |
.b8 57 | |
.b8 99 | |
.b8 99 | |
.b8 57 | |
.b8 100 | |
.b8 53 | |
.b8 57 | |
.b8 57 | |
.b8 55 | |
.b8 101 | |
.b8 50 | |
.b8 55 | |
.b8 55 | |
.b8 100 | |
.b8 99 | |
.b8 55 | |
.b8 49 | |
.b8 53 | |
.b8 0 | |
.b64 $L__func_begin0 | |
.b64 $L__func_end0 | |
} | |
.section .debug_loc { } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment