Skip to content

Instantly share code, notes, and snippets.

@embg
Created July 10, 2024 15:32
Show Gist options
  • Save embg/9fcbf39f14e1712bc6d500ff71ffa67d to your computer and use it in GitHub Desktop.
Save embg/9fcbf39f14e1712bc6d500ff71ffa67d to your computer and use it in GitHub Desktop.
ptxas segfault repro
//
// Generated by LLVM NVPTX Back-End
//
.version 8.4
.target sm_90a
.address_size 64
// .globl _copy_2d_tma_kernel
.extern .shared .align 16 .b8 global_smem[];
.visible .entry _copy_2d_tma_kernel(
.param .u64 _copy_2d_tma_kernel_param_0,
.param .u64 _copy_2d_tma_kernel_param_1,
.param .u64 _copy_2d_tma_kernel_param_2,
.param .u64 _copy_2d_tma_kernel_param_3,
.param .u32 _copy_2d_tma_kernel_param_4,
.param .u32 _copy_2d_tma_kernel_param_5
)
.maxntid 128, 1, 1
{
.reg .pred %p<8>;
.reg .b32 %r<82>;
.reg .f32 %f<33>;
.reg .b64 %rd<3>;
.loc 1 5 0
$L__func_begin0:
.loc 1 5 0
ld.param.u64 %rd2, [_copy_2d_tma_kernel_param_1];
$L__tmp0:
.loc 1 10 4
// begin inline asm
mov.u32 %r1, %ctaid.x;
// end inline asm
ld.param.u32 %r14, [_copy_2d_tma_kernel_param_5];
.loc 1 11 4
add.s32 %r15, %r14, 63;
.loc 1 12 4
shr.s32 %r16, %r15, 31;
shr.u32 %r17, %r16, 26;
add.s32 %r18, %r15, %r17;
shr.s32 %r19, %r18, 6;
ld.param.u64 %rd1, [_copy_2d_tma_kernel_param_3];
.loc 1 13 4
div.s32 %r21, %r1, %r19;
mul.lo.s32 %r22, %r21, %r19;
sub.s32 %r23, %r1, %r22;
.loc 1 15 4
shl.b32 %r6, %r21, 6;
.loc 1 16 4
shl.b32 %r24, %r23, 6;
.loc 1 19 4
mov.u32 %r25, %tid.x;
setp.eq.s32 %p1, %r25, 0;
mov.u32 %r26, global_smem;
add.s32 %r2, %r26, 16384;
// begin inline asm
@%p1 mbarrier.init.shared::cta.b64 [%r2], 1;
// end inline asm
.loc 1 20 4
bar.sync 0;
// begin inline asm
@%p1 mbarrier.arrive.expect_tx.shared.b64 _, [%r2], 16384;
// end inline asm
.loc 1 21 4
bar.sync 0;
shr.u32 %r27, %r25, 5;
shfl.sync.idx.b32 %r28, %r27, 0, 31, -1;
setp.lt.u32 %p7, %r25, 64;
// begin inline asm
@%p7 elect.sync _|%p3, 0xffffffff;
@!%p7 setp.lt.u32 %p3, %r25, 1024;
// end inline asm
and.pred %p4, %p7, %p3;
shl.b32 %r29, %r28, 13;
add.s32 %r4, %r26, %r29;
shl.b32 %r30, %r28, 5;
add.s32 %r5, %r30, %r24;
// begin inline asm
@%p4 cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [%r4], [%rd1, {%r5, %r6}], [%r2];
// end inline asm
.loc 1 22 4
bar.sync 0;
mov.b32 %r9, 0;
// begin inline asm
{
.reg .pred P1;
waitLoop:
mbarrier.try_wait.parity.shared.b64 P1, [%r2], %r9;
@!P1 bra.uni waitLoop;
}
// end inline asm
.loc 1 23 4
bar.sync 0;
// begin inline asm
@%p1 mbarrier.inval.shared::cta.b64 [%r2];
// end inline asm
.loc 1 24 4
and.b32 %r31, %r25, 8;
and.b32 %r32, %r25, 16;
and.b32 %r33, %r25, 31;
bfe.s32 %r34, %r25, 6, 1;
and.b32 %r35, %r34, 36;
xor.b32 %r36, %r35, %r33;
shl.b32 %r37, %r25, 6;
and.b32 %r38, %r37, 2048;
or.b32 %r39, %r36, %r38;
shl.b32 %r40, %r39, 2;
add.s32 %r41, %r26, %r40;
ld.shared.f32 %f1, [%r41];
and.b32 %r42, %r25, 7;
or.b32 %r43, %r42, 8;
xor.b32 %r44, %r43, %r31;
or.b32 %r45, %r44, %r32;
xor.b32 %r46, %r45, %r35;
or.b32 %r47, %r46, %r38;
shl.b32 %r48, %r47, 2;
or.b32 %r49, %r48, 256;
add.s32 %r50, %r26, %r49;
ld.shared.f32 %f2, [%r50];
and.b32 %r51, %r25, 15;
or.b32 %r52, %r51, 16;
or.b32 %r53, %r35, %r32;
xor.b32 %r54, %r53, %r52;
or.b32 %r55, %r54, %r38;
shl.b32 %r56, %r55, 2;
add.s32 %r57, %r26, %r56;
ld.shared.f32 %f3, [%r57+512];
or.b32 %r58, %r42, 24;
and.b32 %r59, %r25, 24;
xor.b32 %r60, %r58, %r59;
xor.b32 %r61, %r60, %r35;
or.b32 %r62, %r61, %r38;
shl.b32 %r63, %r62, 2;
add.s32 %r64, %r26, %r63;
ld.shared.f32 %f4, [%r64+768];
ld.shared.f32 %f5, [%r41+1024];
or.b32 %r65, %r48, 1280;
add.s32 %r66, %r26, %r65;
ld.shared.f32 %f6, [%r66];
ld.shared.f32 %f7, [%r57+1536];
ld.shared.f32 %f8, [%r64+1792];
ld.shared.f32 %f9, [%r41+2048];
or.b32 %r67, %r48, 2304;
add.s32 %r68, %r26, %r67;
ld.shared.f32 %f10, [%r68];
ld.shared.f32 %f11, [%r57+2560];
ld.shared.f32 %f12, [%r64+2816];
ld.shared.f32 %f13, [%r41+3072];
or.b32 %r69, %r48, 3328;
add.s32 %r70, %r26, %r69;
ld.shared.f32 %f14, [%r70];
ld.shared.f32 %f15, [%r57+3584];
ld.shared.f32 %f16, [%r64+3840];
ld.shared.f32 %f17, [%r41+4096];
or.b32 %r71, %r48, 4352;
add.s32 %r72, %r26, %r71;
ld.shared.f32 %f18, [%r72];
ld.shared.f32 %f19, [%r57+4608];
ld.shared.f32 %f20, [%r64+4864];
ld.shared.f32 %f21, [%r41+5120];
or.b32 %r73, %r48, 5376;
add.s32 %r74, %r26, %r73;
ld.shared.f32 %f22, [%r74];
ld.shared.f32 %f23, [%r57+5632];
ld.shared.f32 %f24, [%r64+5888];
ld.shared.f32 %f25, [%r41+6144];
or.b32 %r75, %r48, 6400;
add.s32 %r76, %r26, %r75;
ld.shared.f32 %f26, [%r76];
ld.shared.f32 %f27, [%r57+6656];
ld.shared.f32 %f28, [%r64+6912];
ld.shared.f32 %f29, [%r41+7168];
or.b32 %r77, %r48, 7424;
add.s32 %r78, %r26, %r77;
ld.shared.f32 %f30, [%r78];
ld.shared.f32 %f31, [%r57+7680];
ld.shared.f32 %f32, [%r64+7936];
.loc 1 25 4
bar.sync 0;
st.shared.f32 [%r41], %f1;
st.shared.f32 [%r50], %f2;
st.shared.f32 [%r57+512], %f3;
st.shared.f32 [%r64+768], %f4;
st.shared.f32 [%r41+1024], %f5;
st.shared.f32 [%r66], %f6;
st.shared.f32 [%r57+1536], %f7;
st.shared.f32 [%r64+1792], %f8;
st.shared.f32 [%r41+2048], %f9;
st.shared.f32 [%r68], %f10;
st.shared.f32 [%r57+2560], %f11;
st.shared.f32 [%r64+2816], %f12;
st.shared.f32 [%r41+3072], %f13;
st.shared.f32 [%r70], %f14;
st.shared.f32 [%r57+3584], %f15;
st.shared.f32 [%r64+3840], %f16;
st.shared.f32 [%r41+4096], %f17;
st.shared.f32 [%r72], %f18;
st.shared.f32 [%r57+4608], %f19;
st.shared.f32 [%r64+4864], %f20;
st.shared.f32 [%r41+5120], %f21;
st.shared.f32 [%r74], %f22;
st.shared.f32 [%r57+5632], %f23;
st.shared.f32 [%r64+5888], %f24;
st.shared.f32 [%r41+6144], %f25;
st.shared.f32 [%r76], %f26;
st.shared.f32 [%r57+6656], %f27;
st.shared.f32 [%r64+6912], %f28;
st.shared.f32 [%r41+7168], %f29;
st.shared.f32 [%r78], %f30;
st.shared.f32 [%r57+7680], %f31;
st.shared.f32 [%r64+7936], %f32;
.loc 1 26 4
// begin inline asm
fence.proxy.async.shared::cta;
// end inline asm
.loc 1 27 4
bar.sync 0;
shfl.sync.idx.b32 %r79, %r27, 0, 31, -1;
shl.b32 %r80, %r79, 13;
add.s32 %r13, %r26, %r80;
shl.b32 %r81, %r79, 5;
add.s32 %r11, %r81, %r24;
// begin inline asm
@%p4 cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [%rd2, {%r11, %r6}], [%r13];
// end inline asm
// begin inline asm
cp.async.bulk.commit_group ;
// end inline asm
.loc 1 28 4
// begin inline asm
cp.async.bulk.wait_group.read 0x0;
// end inline asm
.loc 1 29 4
ret;
$L__tmp1:
$L__func_end0:
}
.file 1 "/home/embg/.triton/cache/897ddbc8f4d794d3272ef396aec2ea01c5e3bfb6d23c2c9cc9d5997e277dc715/_copy_2d_tma_kernel.ttgir"
.section .debug_abbrev
{
.b8 1
.b8 17
.b8 0
.b8 37
.b8 8
.b8 19
.b8 5
.b8 3
.b8 8
.b8 16
.b8 6
.b8 27
.b8 8
.b8 17
.b8 1
.b8 18
.b8 1
.b8 0
.b8 0
.b8 0
}
.section .debug_info
{
.b32 153
.b8 2
.b8 0
.b32 .debug_abbrev
.b8 8
.b8 1
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
.b8 2
.b8 0
.b8 95
.b8 99
.b8 111
.b8 112
.b8 121
.b8 95
.b8 50
.b8 100
.b8 95
.b8 116
.b8 109
.b8 97
.b8 95
.b8 107
.b8 101
.b8 114
.b8 110
.b8 101
.b8 108
.b8 46
.b8 116
.b8 116
.b8 103
.b8 105
.b8 114
.b8 0
.b32 .debug_line
.b8 47
.b8 104
.b8 111
.b8 109
.b8 101
.b8 47
.b8 101
.b8 109
.b8 98
.b8 103
.b8 47
.b8 46
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 47
.b8 99
.b8 97
.b8 99
.b8 104
.b8 101
.b8 47
.b8 56
.b8 57
.b8 55
.b8 100
.b8 100
.b8 98
.b8 99
.b8 56
.b8 102
.b8 52
.b8 100
.b8 55
.b8 57
.b8 52
.b8 100
.b8 51
.b8 50
.b8 55
.b8 50
.b8 101
.b8 102
.b8 51
.b8 57
.b8 54
.b8 97
.b8 101
.b8 99
.b8 50
.b8 101
.b8 97
.b8 48
.b8 49
.b8 99
.b8 53
.b8 101
.b8 51
.b8 98
.b8 102
.b8 98
.b8 54
.b8 100
.b8 50
.b8 51
.b8 99
.b8 50
.b8 99
.b8 57
.b8 99
.b8 99
.b8 57
.b8 100
.b8 53
.b8 57
.b8 57
.b8 55
.b8 101
.b8 50
.b8 55
.b8 55
.b8 100
.b8 99
.b8 55
.b8 49
.b8 53
.b8 0
.b64 $L__func_begin0
.b64 $L__func_end0
}
.section .debug_loc { }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment