// Source: GitHub Gist HDCharles/d50b5da7365ea7f256a00c771e7fc0ec
// Author: @HDCharles
// Created: July 27, 2023 20:20
//
// Generated by LLVM NVPTX Back-End
//
.version 8.1
.target sm_80
.address_size 64
// .globl int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c
.extern .shared .align 1 .b8 global_smem[];
.visible .entry int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c(
.param .u64 int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_0,
.param .u64 int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_1,
.param .u64 int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_2,
.param .u64 int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_3,
.param .u64 int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_4,
.param .u32 int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_5,
.param .u32 int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_6,
.param .u32 int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_7,
.param .u32 int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_8,
.param .u32 int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_9,
.param .u32 int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_10
)
.maxntid 64, 1, 1
{
.reg .pred %p<23>;
.reg .b16 %rs<357>;
.reg .b32 %r<802>;
.reg .f32 %f<481>;
.reg .b64 %rd<89>;
.loc 1 167 0
$L__func_begin0:
.loc 1 167 0
ld.param.u32 %r122, [int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_10];
ld.param.u32 %r121, [int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_7];
ld.param.u32 %r120, [int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_6];
ld.param.u32 %r119, [int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_5];
ld.param.u64 %rd31, [int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_4];
ld.param.u64 %rd68, [int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_3];
ld.param.u64 %rd29, [int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_2];
ld.param.u64 %rd88, [int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_1];
ld.param.u64 %rd27, [int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_0];
$L__tmp0:
.loc 1 207 51
mov.u32 %r1, %tid.x;
and.b32 %r189, %r1, 31;
bfe.u32 %r2, %r1, 5, 1;
bfe.u32 %r3, %r1, 3, 2;
shl.b32 %r190, %r2, 2;
or.b32 %r4, %r190, %r3;
ld.param.u32 %r191, [int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_8];
.loc 1 208 51
bfe.u32 %r5, %r1, 2, 3;
ld.param.u32 %r192, [int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_9];
shl.b32 %r6, %r2, 3;
or.b32 %r193, %r6, %r5;
and.b32 %r194, %r1, 7;
shl.b32 %r7, %r194, 3;
.loc 1 211 29
and.b32 %r8, %r1, 3;
shl.b32 %r9, %r8, 4;
.loc 1 190 24
mov.u32 %r195, %ctaid.x;
$L__tmp1:
.loc 2 21 0
add.s32 %r196, %r119, 31;
.loc 2 21 28
shr.s32 %r197, %r196, 31;
shr.u32 %r198, %r197, 27;
add.s32 %r199, %r196, %r198;
shr.s32 %r200, %r199, 5;
$L__tmp2:
.loc 2 21 0
add.s32 %r201, %r120, 63;
.loc 2 21 28
shr.s32 %r202, %r201, 31;
shr.u32 %r203, %r202, 26;
add.s32 %r204, %r201, %r203;
shr.s32 %r205, %r204, 6;
$L__tmp3:
.loc 1 193 38
shl.b32 %r207, %r205, 3;
.loc 1 194 22
div.s32 %r208, %r195, %r207;
.loc 1 195 29
shl.b32 %r209, %r208, 3;
.loc 1 196 35
sub.s32 %r210, %r200, %r209;
$L__tmp4:
.loc 3 1385 27
min.s32 %r211, %r210, 8;
$L__tmp5:
.loc 1 197 33
rem.s32 %r212, %r195, %r211;
.loc 1 197 27
add.s32 %r213, %r209, %r212;
mul.lo.s32 %r214, %r208, %r207;
sub.s32 %r215, %r195, %r214;
.loc 1 198 40
div.s32 %r216, %r215, %r211;
.loc 1 207 23
shl.b32 %r217, %r213, 5;
.loc 1 207 38
or.b32 %r10, %r217, %r4;
or.b32 %r11, %r10, 8;
or.b32 %r12, %r10, 16;
or.b32 %r13, %r10, 24;
.loc 1 207 68
rem.s32 %r218, %r10, %r119;
rem.s32 %r219, %r11, %r119;
rem.s32 %r220, %r12, %r119;
rem.s32 %r221, %r13, %r119;
.loc 1 208 23
shl.b32 %r14, %r216, 6;
.loc 1 208 38
or.b32 %r222, %r14, %r193;
or.b32 %r223, %r222, 16;
or.b32 %r224, %r222, 32;
or.b32 %r225, %r222, 48;
.loc 1 208 68
rem.s32 %r226, %r222, %r120;
rem.s32 %r227, %r223, %r120;
rem.s32 %r228, %r224, %r120;
rem.s32 %r229, %r225, %r120;
.loc 1 210 53
mad.lo.s32 %r230, %r218, %r191, %r7;
mad.lo.s32 %r231, %r219, %r191, %r7;
mad.lo.s32 %r232, %r220, %r191, %r7;
mad.lo.s32 %r233, %r221, %r191, %r7;
.loc 1 210 22
mul.wide.s32 %rd48, %r230, 2;
add.s64 %rd32, %rd27, %rd48;
mul.wide.s32 %rd49, %r231, 2;
add.s64 %rd33, %rd27, %rd49;
mul.wide.s32 %rd50, %r232, 2;
add.s64 %rd34, %rd27, %rd50;
mul.wide.s32 %rd51, %r233, 2;
add.s64 %rd35, %rd27, %rd51;
.loc 1 211 52
mad.lo.s32 %r234, %r226, %r192, %r9;
mad.lo.s32 %r235, %r227, %r192, %r9;
mad.lo.s32 %r236, %r228, %r192, %r9;
mad.lo.s32 %r237, %r229, %r192, %r9;
.loc 1 211 22
cvt.s64.s32 %rd5, %r234;
add.s64 %rd36, %rd88, %rd5;
cvt.s64.s32 %rd6, %r235;
add.s64 %rd37, %rd88, %rd6;
cvt.s64.s32 %rd7, %r236;
add.s64 %rd38, %rd88, %rd7;
cvt.s64.s32 %rd8, %r237;
add.s64 %rd39, %rd88, %rd8;
$L__tmp6:
.loc 2 21 0
add.s32 %r238, %r121, 63;
$L__tmp7:
.loc 1 223 22
setp.lt.s32 %p1, %r238, 64;
setp.gt.s32 %p2, %r238, 63;
.loc 1 226 51
setp.lt.s32 %p3, %r7, %r121;
.loc 1 226 20
xor.b32 %r16, %r4, %r194;
shl.b32 %r242, %r16, 3;
and.b32 %r243, %r242, 48;
shl.b32 %r244, %r1, 3;
and.b32 %r245, %r244, 8;
or.b32 %r246, %r245, %r243;
shl.b32 %r247, %r4, 7;
shl.b32 %r248, %r246, 1;
or.b32 %r249, %r247, %r248;
mov.u32 %r250, global_smem;
add.s32 %r123, %r250, %r249;
add.s32 %r125, %r123, 1024;
add.s32 %r127, %r123, 2048;
add.s32 %r129, %r123, 3072;
selp.b32 %r251, 16, 0, %p2;
selp.b32 %r126, %r251, 0, %p3;
cp.async.cg.shared.global [ %r123 + 0 ], [ %rd32 + 0 ], 0x10, %r126;
cp.async.cg.shared.global [ %r125 + 0 ], [ %rd33 + 0 ], 0x10, %r126;
cp.async.cg.shared.global [ %r127 + 0 ], [ %rd34 + 0 ], 0x10, %r126;
cp.async.cg.shared.global [ %r129 + 0 ], [ %rd35 + 0 ], 0x10, %r126;
cp.async.commit_group ;
.loc 1 227 51
setp.lt.s32 %p4, %r9, %r121;
.loc 1 227 20
shr.u32 %r252, %r1, 3;
shl.b32 %r253, %r193, 6;
xor.b32 %r254, %r252, %r1;
shl.b32 %r255, %r254, 4;
and.b32 %r17, %r255, 48;
or.b32 %r256, %r253, %r17;
add.s32 %r794, %r250, 12288;
add.s32 %r131, %r794, %r256;
add.s32 %r133, %r131, 1024;
add.s32 %r135, %r131, 2048;
add.s32 %r137, %r131, 3072;
selp.b32 %r134, %r251, 0, %p4;
cp.async.cg.shared.global [ %r131 + 0 ], [ %rd36 + 0 ], 0x10, %r134;
cp.async.cg.shared.global [ %r133 + 0 ], [ %rd37 + 0 ], 0x10, %r134;
cp.async.cg.shared.global [ %r135 + 0 ], [ %rd38 + 0 ], 0x10, %r134;
cp.async.cg.shared.global [ %r137 + 0 ], [ %rd39 + 0 ], 0x10, %r134;
cp.async.commit_group ;
.loc 1 231 18
add.s64 %rd40, %rd32, 128;
add.s64 %rd41, %rd33, 128;
add.s64 %rd42, %rd34, 128;
add.s64 %rd43, %rd35, 128;
.loc 1 232 18
add.s64 %rd44, %rd36, 64;
add.s64 %rd45, %rd37, 64;
add.s64 %rd46, %rd38, 64;
add.s64 %rd47, %rd39, 64;
.loc 1 223 22
setp.gt.s32 %p5, %r238, 127;
.loc 1 226 55
add.s32 %r258, %r121, -64;
.loc 1 226 51
setp.lt.s32 %p6, %r7, %r258;
.loc 1 226 20
bar.sync 0;
add.s32 %r139, %r123, 4096;
add.s32 %r141, %r123, 5120;
add.s32 %r143, %r123, 6144;
add.s32 %r145, %r123, 7168;
selp.b32 %r259, 16, 0, %p5;
selp.b32 %r142, %r259, 0, %p6;
cp.async.cg.shared.global [ %r139 + 0 ], [ %rd40 + 0 ], 0x10, %r142;
cp.async.cg.shared.global [ %r141 + 0 ], [ %rd41 + 0 ], 0x10, %r142;
cp.async.cg.shared.global [ %r143 + 0 ], [ %rd42 + 0 ], 0x10, %r142;
cp.async.cg.shared.global [ %r145 + 0 ], [ %rd43 + 0 ], 0x10, %r142;
cp.async.commit_group ;
.loc 1 227 51
setp.lt.s32 %p7, %r9, %r258;
.loc 1 227 20
add.s32 %r260, %r250, %r256;
add.s32 %r147, %r260, 16384;
add.s32 %r149, %r260, 17408;
add.s32 %r151, %r260, 18432;
add.s32 %r153, %r260, 19456;
selp.b32 %r150, %r259, 0, %p7;
cp.async.cg.shared.global [ %r147 + 0 ], [ %rd44 + 0 ], 0x10, %r150;
cp.async.cg.shared.global [ %r149 + 0 ], [ %rd45 + 0 ], 0x10, %r150;
cp.async.cg.shared.global [ %r151 + 0 ], [ %rd46 + 0 ], 0x10, %r150;
cp.async.cg.shared.global [ %r153 + 0 ], [ %rd47 + 0 ], 0x10, %r150;
cp.async.commit_group ;
.loc 1 226 20
cp.async.wait_group 0x2;
bar.sync 0;
shl.b32 %r18, %r8, 2;
.loc 1 227 20
bfe.u32 %r38, %r189, 3, 1;
bfe.u32 %r39, %r1, 1, 2;
and.b32 %r40, %r1, 23;
or.b32 %r41, %r40, %r6;
xor.b32 %r267, %r38, %r39;
shl.b32 %r268, %r41, 6;
shl.b32 %r42, %r267, 4;
or.b32 %r269, %r268, %r42;
add.s32 %r159, %r794, %r269;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r155, %r156, %r157, %r158 }, [ %r159 + 0 ];
add.s32 %r164, %r159, 2048;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r160, %r161, %r162, %r163 }, [ %r164 + 0 ];
mov.f32 %f417, 0f00000000;
mov.f32 %f418, %f417;
mov.f32 %f419, %f417;
mov.f32 %f420, %f417;
mov.f32 %f421, %f417;
mov.f32 %f422, %f417;
mov.f32 %f423, %f417;
mov.f32 %f424, %f417;
mov.f32 %f425, %f417;
mov.f32 %f426, %f417;
mov.f32 %f427, %f417;
mov.f32 %f428, %f417;
mov.f32 %f429, %f417;
mov.f32 %f430, %f417;
mov.f32 %f431, %f417;
mov.f32 %f432, %f417;
mov.f32 %f433, %f417;
mov.f32 %f434, %f417;
mov.f32 %f435, %f417;
mov.f32 %f436, %f417;
mov.f32 %f437, %f417;
mov.f32 %f438, %f417;
mov.f32 %f439, %f417;
mov.f32 %f440, %f417;
mov.f32 %f441, %f417;
mov.f32 %f442, %f417;
mov.f32 %f443, %f417;
mov.f32 %f444, %f417;
mov.f32 %f445, %f417;
mov.f32 %f446, %f417;
mov.f32 %f447, %f417;
mov.f32 %f448, %f417;
.loc 1 223 22
@%p1 bra $L__BB0_3;
.loc 1 0 0
cvt.s64.s32 %rd1, %r230;
cvt.s64.s32 %rd2, %r231;
cvt.s64.s32 %rd3, %r232;
cvt.s64.s32 %rd4, %r233;
shr.s32 %r239, %r238, 31;
shr.u32 %r240, %r239, 26;
add.s32 %r241, %r238, %r240;
shr.s32 %r15, %r241, 6;
shl.b32 %r261, %r3, 4;
shl.b32 %r19, %r5, 6;
or.b32 %r262, %r261, %r19;
or.b32 %r20, %r262, %r18;
shl.b32 %r263, %r20, 1;
add.s32 %r264, %r250, %r263;
ld.shared.v2.u32 {%r791, %r790}, [%r264];
ld.shared.v2.u32 {%r789, %r788}, [%r264+1024];
xor.b32 %r25, %r20, 16;
shl.b32 %r265, %r25, 1;
add.s32 %r266, %r250, %r265;
ld.shared.v2.u32 {%r787, %r786}, [%r266];
ld.shared.v2.u32 {%r785, %r784}, [%r266+1024];
ld.shared.v2.u32 {%r783, %r782}, [%r264+2048];
ld.shared.v2.u32 {%r781, %r780}, [%r264+3072];
ld.shared.v2.u32 {%r779, %r778}, [%r266+2048];
ld.shared.v2.u32 {%r777, %r776}, [%r266+3072];
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r155;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r165, f0, f1, 0x7632;
prmt.b32 %r166, f2, f3, 0x7632;
}
{ .reg .b16 tmp; mov.b32 {tmp, %rs356}, %r166; }
cvt.u16.u32 %rs355, %r166;
{ .reg .b16 tmp; mov.b32 {tmp, %rs354}, %r165; }
cvt.u16.u32 %rs353, %r165;
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r156;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r168, f0, f1, 0x7632;
prmt.b32 %r169, f2, f3, 0x7632;
}
{ .reg .b16 tmp; mov.b32 {tmp, %rs348}, %r169; }
cvt.u16.u32 %rs347, %r169;
{ .reg .b16 tmp; mov.b32 {tmp, %rs346}, %r168; }
cvt.u16.u32 %rs345, %r168;
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r157;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r171, f0, f1, 0x7632;
prmt.b32 %r172, f2, f3, 0x7632;
}
{ .reg .b16 tmp; mov.b32 {tmp, %rs352}, %r172; }
cvt.u16.u32 %rs351, %r172;
{ .reg .b16 tmp; mov.b32 {tmp, %rs350}, %r171; }
cvt.u16.u32 %rs349, %r171;
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r158;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r174, f0, f1, 0x7632;
prmt.b32 %r175, f2, f3, 0x7632;
}
{ .reg .b16 tmp; mov.b32 {tmp, %rs344}, %r175; }
cvt.u16.u32 %rs343, %r175;
{ .reg .b16 tmp; mov.b32 {tmp, %rs342}, %r174; }
cvt.u16.u32 %rs341, %r174;
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r160;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r177, f0, f1, 0x7632;
prmt.b32 %r178, f2, f3, 0x7632;
}
{ .reg .b16 tmp; mov.b32 {tmp, %rs340}, %r178; }
cvt.u16.u32 %rs339, %r178;
{ .reg .b16 tmp; mov.b32 {tmp, %rs338}, %r177; }
cvt.u16.u32 %rs337, %r177;
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r161;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r180, f0, f1, 0x7632;
prmt.b32 %r181, f2, f3, 0x7632;
}
{ .reg .b16 tmp; mov.b32 {tmp, %rs332}, %r181; }
cvt.u16.u32 %rs331, %r181;
{ .reg .b16 tmp; mov.b32 {tmp, %rs330}, %r180; }
cvt.u16.u32 %rs329, %r180;
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r162;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r183, f0, f1, 0x7632;
prmt.b32 %r184, f2, f3, 0x7632;
}
{ .reg .b16 tmp; mov.b32 {tmp, %rs336}, %r184; }
cvt.u16.u32 %rs335, %r184;
{ .reg .b16 tmp; mov.b32 {tmp, %rs334}, %r183; }
cvt.u16.u32 %rs333, %r183;
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r163;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r186, f0, f1, 0x7632;
prmt.b32 %r187, f2, f3, 0x7632;
}
{ .reg .b16 tmp; mov.b32 {tmp, %rs328}, %r187; }
cvt.u16.u32 %rs327, %r187;
{ .reg .b16 tmp; mov.b32 {tmp, %rs326}, %r186; }
cvt.u16.u32 %rs325, %r186;
.loc 1 223 22
shl.b32 %r284, %r2, 9;
shl.b32 %r285, %r40, 6;
.loc 1 229 38
or.b32 %r286, %r285, %r42;
or.b32 %r43, %r286, 18432;
.loc 1 223 22
add.s32 %r44, %r250, %r284;
.loc 1 229 38
or.b32 %r45, %r286, 16384;
.loc 1 223 22
add.s32 %r773, %r121, -128;
.loc 1 229 38
add.s32 %r289, %r19, %r17;
or.b32 %r49, %r289, 20480;
or.b32 %r50, %r289, 23552;
or.b32 %r51, %r289, 22528;
or.b32 %r52, %r289, 21504;
.loc 1 223 22
shl.b32 %r290, %r3, 7;
shl.b32 %r291, %r16, 4;
and.b32 %r292, %r291, 96;
.loc 1 229 38
or.b32 %r293, %r290, %r292;
.loc 1 223 22
cvt.u16.u32 %rs97, %r1;
and.b16 %rs98, %rs97, 1;
mul.wide.u16 %r294, %rs98, 16;
.loc 1 229 38
or.b32 %r295, %r293, %r294;
or.b32 %r53, %r295, 8192;
or.b32 %r54, %r295, 11264;
or.b32 %r55, %r295, 10240;
or.b32 %r56, %r295, 9216;
.loc 1 223 22
add.s64 %rd9, %rd8, 128;
add.s64 %rd10, %rd7, 128;
add.s64 %rd11, %rd6, 128;
add.s64 %rd12, %rd5, 128;
shl.b64 %rd52, %rd4, 1;
add.s64 %rd53, %rd52, %rd27;
add.s64 %rd87, %rd53, 256;
shl.b64 %rd54, %rd3, 1;
add.s64 %rd55, %rd54, %rd27;
add.s64 %rd86, %rd55, 256;
shl.b64 %rd56, %rd2, 1;
add.s64 %rd57, %rd56, %rd27;
add.s64 %rd85, %rd57, 256;
shl.b64 %rd58, %rd1, 1;
add.s64 %rd59, %rd58, %rd27;
add.s64 %rd84, %rd59, 256;
mov.f32 %f417, 0f00000000;
mov.u32 %r775, 0;
mov.u32 %r800, 1;
mov.u32 %r799, 64;
mov.u32 %r793, 2;
mov.u32 %r774, 16384;
mov.u32 %r792, %r800;
mov.u32 %r801, %r775;
mov.u32 %r798, %r250;
mov.f32 %f418, %f417;
mov.f32 %f419, %f417;
mov.f32 %f420, %f417;
mov.f32 %f421, %f417;
mov.f32 %f422, %f417;
mov.f32 %f423, %f417;
mov.f32 %f424, %f417;
mov.f32 %f425, %f417;
mov.f32 %f426, %f417;
mov.f32 %f427, %f417;
mov.f32 %f428, %f417;
mov.f32 %f429, %f417;
mov.f32 %f430, %f417;
mov.f32 %f431, %f417;
mov.f32 %f432, %f417;
mov.f32 %f433, %f417;
mov.f32 %f434, %f417;
mov.f32 %f435, %f417;
mov.f32 %f436, %f417;
mov.f32 %f437, %f417;
mov.f32 %f438, %f417;
mov.f32 %f439, %f417;
mov.f32 %f440, %f417;
mov.f32 %f441, %f417;
mov.f32 %f442, %f417;
mov.f32 %f443, %f417;
mov.f32 %f444, %f417;
mov.f32 %f445, %f417;
mov.f32 %f446, %f417;
mov.f32 %f447, %f417;
mov.f32 %f448, %f417;
$L__BB0_2:
.loc 1 229 38
mul.hi.u32 %r572, %r792, -1431655765;
shr.u32 %r573, %r572, 1;
mul.lo.s32 %r574, %r573, 12288;
sub.s32 %r575, %r43, %r574;
sub.s32 %r576, %r45, %r574;
sub.s32 %r577, %r774, %r574;
mul.hi.u32 %r578, %r793, -1431655765;
shr.u32 %r579, %r578, 1;
mul.lo.s32 %r580, %r579, 12288;
sub.s32 %r581, %r49, %r580;
sub.s32 %r582, %r50, %r580;
sub.s32 %r583, %r51, %r580;
sub.s32 %r584, %r52, %r580;
sub.s32 %r585, %r53, %r580;
sub.s32 %r586, %r54, %r580;
sub.s32 %r587, %r55, %r580;
sub.s32 %r588, %r56, %r580;
cvt.u32.u16 %r589, %rs325;
cvt.u32.u16 %r590, %rs326;
shl.b32 %r591, %r590, 16;
or.b32 %r424, %r589, %r591;
cvt.u32.u16 %r592, %rs327;
cvt.u32.u16 %r593, %rs328;
shl.b32 %r594, %r593, 16;
or.b32 %r425, %r592, %r594;
cvt.u32.u16 %r595, %rs329;
cvt.u32.u16 %r596, %rs330;
shl.b32 %r597, %r596, 16;
or.b32 %r418, %r595, %r597;
cvt.u32.u16 %r598, %rs331;
cvt.u32.u16 %r599, %rs332;
shl.b32 %r600, %r599, 16;
or.b32 %r419, %r598, %r600;
cvt.u32.u16 %r601, %rs333;
cvt.u32.u16 %r602, %rs334;
shl.b32 %r603, %r602, 16;
or.b32 %r376, %r601, %r603;
cvt.u32.u16 %r604, %rs335;
cvt.u32.u16 %r605, %rs336;
shl.b32 %r606, %r605, 16;
or.b32 %r377, %r604, %r606;
cvt.u32.u16 %r607, %rs337;
cvt.u32.u16 %r608, %rs338;
shl.b32 %r609, %r608, 16;
or.b32 %r370, %r607, %r609;
cvt.u32.u16 %r610, %rs339;
cvt.u32.u16 %r611, %rs340;
shl.b32 %r612, %r611, 16;
or.b32 %r371, %r610, %r612;
cvt.u32.u16 %r613, %rs341;
cvt.u32.u16 %r614, %rs342;
shl.b32 %r615, %r614, 16;
or.b32 %r412, %r613, %r615;
cvt.u32.u16 %r616, %rs343;
cvt.u32.u16 %r617, %rs344;
shl.b32 %r618, %r617, 16;
or.b32 %r413, %r616, %r618;
cvt.u32.u16 %r619, %rs345;
cvt.u32.u16 %r620, %rs346;
shl.b32 %r621, %r620, 16;
or.b32 %r406, %r619, %r621;
cvt.u32.u16 %r622, %rs347;
cvt.u32.u16 %r623, %rs348;
shl.b32 %r624, %r623, 16;
or.b32 %r407, %r622, %r624;
cvt.u32.u16 %r625, %rs349;
cvt.u32.u16 %r626, %rs350;
shl.b32 %r627, %r626, 16;
or.b32 %r364, %r625, %r627;
cvt.u32.u16 %r628, %rs351;
cvt.u32.u16 %r629, %rs352;
shl.b32 %r630, %r629, 16;
or.b32 %r365, %r628, %r630;
cvt.u32.u16 %r631, %rs353;
cvt.u32.u16 %r632, %rs354;
shl.b32 %r633, %r632, 16;
or.b32 %r358, %r631, %r633;
cvt.u32.u16 %r634, %rs355;
cvt.u32.u16 %r635, %rs356;
shl.b32 %r636, %r635, 16;
or.b32 %r359, %r634, %r636;
.loc 1 226 20
add.s32 %r637, %r801, 32;
shl.b32 %r638, %r800, 6;
add.s32 %r639, %r798, %r638;
shr.u32 %r640, %r637, 4;
xor.b32 %r641, %r640, %r3;
shl.b32 %r642, %r641, 4;
mad.lo.s32 %r643, %r799, %r5, %r18;
add.s32 %r644, %r643, %r642;
mov.u32 %r645, -32;
sub.s32 %r646, %r645, %r801;
shl.b32 %r647, %r646, 1;
add.s32 %r648, %r639, %r647;
shl.b32 %r649, %r644, 1;
add.s32 %r650, %r648, %r649;
shl.b32 %r651, %r799, 4;
add.s32 %r652, %r650, %r651;
ld.shared.v2.u32 {%r426, %r428}, [%r650];
ld.shared.v2.u32 {%r427, %r429}, [%r652];
add.s32 %r653, %r640, 1;
xor.b32 %r654, %r653, %r3;
shl.b32 %r655, %r654, 4;
add.s32 %r656, %r643, %r655;
shl.b32 %r657, %r656, 1;
add.s32 %r658, %r648, %r657;
add.s32 %r659, %r658, %r651;
ld.shared.v2.u32 {%r474, %r476}, [%r658];
ld.shared.v2.u32 {%r475, %r477}, [%r659];
shl.b32 %r660, %r799, 5;
add.s32 %r661, %r650, %r660;
mul.lo.s32 %r662, %r799, 48;
add.s32 %r663, %r650, %r662;
ld.shared.v2.u32 {%r450, %r452}, [%r661];
ld.shared.v2.u32 {%r451, %r453}, [%r663];
add.s32 %r664, %r658, %r660;
add.s32 %r665, %r658, %r662;
ld.shared.v2.u32 {%r498, %r500}, [%r664];
ld.shared.v2.u32 {%r499, %r501}, [%r665];
.loc 1 227 20
add.s32 %r666, %r801, 32;
shl.b32 %r667, %r800, 5;
add.s32 %r668, %r794, %r667;
shr.u32 %r669, %r666, 4;
add.s32 %r670, %r669, %r38;
xor.b32 %r671, %r670, %r39;
shl.b32 %r672, %r671, 4;
mad.lo.s32 %r673, %r799, %r41, %r672;
sub.s32 %r674, %r668, %r801;
add.s32 %r675, %r674, %r673;
add.s32 %r300, %r675, -32;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r296, %r297, %r298, %r299 }, [ %r300 + 0 ];
shl.b32 %r676, %r799, 5;
add.s32 %r305, %r300, %r676;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r301, %r302, %r303, %r304 }, [ %r305 + 0 ];
.loc 1 229 38
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r296;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r430, f0, f1, 0x7632;
prmt.b32 %r431, f2, f3, 0x7632;
}
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r297;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r478, f0, f1, 0x7632;
prmt.b32 %r479, f2, f3, 0x7632;
}
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r298;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r436, f0, f1, 0x7632;
prmt.b32 %r437, f2, f3, 0x7632;
}
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r299;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r484, f0, f1, 0x7632;
prmt.b32 %r485, f2, f3, 0x7632;
}
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r301;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r442, f0, f1, 0x7632;
prmt.b32 %r443, f2, f3, 0x7632;
}
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r302;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r490, f0, f1, 0x7632;
prmt.b32 %r491, f2, f3, 0x7632;
}
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r303;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r448, f0, f1, 0x7632;
prmt.b32 %r449, f2, f3, 0x7632;
}
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r304;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r496, f0, f1, 0x7632;
prmt.b32 %r497, f2, f3, 0x7632;
}
.loc 1 229 0
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f417, %f418, %f419, %f420 }, { %r791, %r789, %r790, %r788 }, { %r358, %r359 }, { %f417, %f418, %f419, %f420 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f421, %f422, %f423, %f424 }, { %r791, %r789, %r790, %r788 }, { %r364, %r365 }, { %f421, %f422, %f423, %f424 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f425, %f426, %f427, %f428 }, { %r791, %r789, %r790, %r788 }, { %r370, %r371 }, { %f425, %f426, %f427, %f428 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f429, %f430, %f431, %f432 }, { %r791, %r789, %r790, %r788 }, { %r376, %r377 }, { %f429, %f430, %f431, %f432 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f433, %f434, %f435, %f436 }, { %r783, %r781, %r782, %r780 }, { %r358, %r359 }, { %f433, %f434, %f435, %f436 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f437, %f438, %f439, %f440 }, { %r783, %r781, %r782, %r780 }, { %r364, %r365 }, { %f437, %f438, %f439, %f440 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f441, %f442, %f443, %f444 }, { %r783, %r781, %r782, %r780 }, { %r370, %r371 }, { %f441, %f442, %f443, %f444 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f445, %f446, %f447, %f448 }, { %r783, %r781, %r782, %r780 }, { %r376, %r377 }, { %f445, %f446, %f447, %f448 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f417, %f418, %f419, %f420 }, { %r787, %r785, %r786, %r784 }, { %r406, %r407 }, { %f417, %f418, %f419, %f420 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f421, %f422, %f423, %f424 }, { %r787, %r785, %r786, %r784 }, { %r412, %r413 }, { %f421, %f422, %f423, %f424 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f425, %f426, %f427, %f428 }, { %r787, %r785, %r786, %r784 }, { %r418, %r419 }, { %f425, %f426, %f427, %f428 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f429, %f430, %f431, %f432 }, { %r787, %r785, %r786, %r784 }, { %r424, %r425 }, { %f429, %f430, %f431, %f432 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f433, %f434, %f435, %f436 }, { %r779, %r777, %r778, %r776 }, { %r406, %r407 }, { %f433, %f434, %f435, %f436 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f437, %f438, %f439, %f440 }, { %r779, %r777, %r778, %r776 }, { %r412, %r413 }, { %f437, %f438, %f439, %f440 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f441, %f442, %f443, %f444 }, { %r779, %r777, %r778, %r776 }, { %r418, %r419 }, { %f441, %f442, %f443, %f444 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f445, %f446, %f447, %f448 }, { %r779, %r777, %r778, %r776 }, { %r424, %r425 }, { %f445, %f446, %f447, %f448 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f417, %f418, %f419, %f420 }, { %r426, %r427, %r428, %r429 }, { %r430, %r431 }, { %f417, %f418, %f419, %f420 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f421, %f422, %f423, %f424 }, { %r426, %r427, %r428, %r429 }, { %r436, %r437 }, { %f421, %f422, %f423, %f424 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f425, %f426, %f427, %f428 }, { %r426, %r427, %r428, %r429 }, { %r442, %r443 }, { %f425, %f426, %f427, %f428 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f429, %f430, %f431, %f432 }, { %r426, %r427, %r428, %r429 }, { %r448, %r449 }, { %f429, %f430, %f431, %f432 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f433, %f434, %f435, %f436 }, { %r450, %r451, %r452, %r453 }, { %r430, %r431 }, { %f433, %f434, %f435, %f436 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f437, %f438, %f439, %f440 }, { %r450, %r451, %r452, %r453 }, { %r436, %r437 }, { %f437, %f438, %f439, %f440 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f441, %f442, %f443, %f444 }, { %r450, %r451, %r452, %r453 }, { %r442, %r443 }, { %f441, %f442, %f443, %f444 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f445, %f446, %f447, %f448 }, { %r450, %r451, %r452, %r453 }, { %r448, %r449 }, { %f445, %f446, %f447, %f448 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f417, %f418, %f419, %f420 }, { %r474, %r475, %r476, %r477 }, { %r478, %r479 }, { %f417, %f418, %f419, %f420 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f421, %f422, %f423, %f424 }, { %r474, %r475, %r476, %r477 }, { %r484, %r485 }, { %f421, %f422, %f423, %f424 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f425, %f426, %f427, %f428 }, { %r474, %r475, %r476, %r477 }, { %r490, %r491 }, { %f425, %f426, %f427, %f428 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f429, %f430, %f431, %f432 }, { %r474, %r475, %r476, %r477 }, { %r496, %r497 }, { %f429, %f430, %f431, %f432 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f433, %f434, %f435, %f436 }, { %r498, %r499, %r500, %r501 }, { %r478, %r479 }, { %f433, %f434, %f435, %f436 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f437, %f438, %f439, %f440 }, { %r498, %r499, %r500, %r501 }, { %r484, %r485 }, { %f437, %f438, %f439, %f440 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f441, %f442, %f443, %f444 }, { %r498, %r499, %r500, %r501 }, { %r490, %r491 }, { %f441, %f442, %f443, %f444 };
mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f445, %f446, %f447, %f448 }, { %r498, %r499, %r500, %r501 }, { %r496, %r497 }, { %f445, %f446, %f447, %f448 };
.loc 1 223 22
add.s32 %r792, %r792, 1;
setp.lt.s32 %p8, %r793, %r15;
.loc 1 226 51
setp.lt.s32 %p9, %r7, %r773;
.loc 1 227 51
setp.lt.s32 %p10, %r9, %r773;
.loc 1 232 18
add.s64 %rd64, %rd88, %rd12;
add.s64 %rd65, %rd88, %rd11;
add.s64 %rd66, %rd88, %rd10;
.loc 1 226 20
add.s64 %rd67, %rd88, %rd9;
bar.sync 0;
add.s32 %r677, %r775, %r585;
add.s32 %r522, %r44, %r677;
add.s32 %r678, %r775, %r588;
add.s32 %r524, %r44, %r678;
add.s32 %r679, %r775, %r587;
add.s32 %r526, %r44, %r679;
add.s32 %r680, %r775, %r586;
add.s32 %r528, %r44, %r680;
selp.b32 %r681, 16, 0, %p9;
selp.b32 %r525, %r681, 0, %p8;
cp.async.cg.shared.global [ %r522 + 0 ], [ %rd84 + 0 ], 0x10, %r525;
cp.async.cg.shared.global [ %r524 + 0 ], [ %rd85 + 0 ], 0x10, %r525;
cp.async.cg.shared.global [ %r526 + 0 ], [ %rd86 + 0 ], 0x10, %r525;
cp.async.cg.shared.global [ %r528 + 0 ], [ %rd87 + 0 ], 0x10, %r525;
cp.async.commit_group ;
.loc 1 227 20
add.s32 %r682, %r775, %r581;
add.s32 %r530, %r44, %r682;
add.s32 %r683, %r775, %r584;
add.s32 %r532, %r44, %r683;
add.s32 %r684, %r775, %r583;
add.s32 %r534, %r44, %r684;
add.s32 %r685, %r775, %r582;
add.s32 %r536, %r44, %r685;
selp.b32 %r686, 16, 0, %p10;
selp.b32 %r533, %r686, 0, %p8;
cp.async.cg.shared.global [ %r530 + 0 ], [ %rd64 + 0 ], 0x10, %r533;
cp.async.cg.shared.global [ %r532 + 0 ], [ %rd65 + 0 ], 0x10, %r533;
cp.async.cg.shared.global [ %r534 + 0 ], [ %rd66 + 0 ], 0x10, %r533;
cp.async.cg.shared.global [ %r536 + 0 ], [ %rd67 + 0 ], 0x10, %r533;
cp.async.commit_group ;
.loc 1 226 20
cp.async.wait_group 0x2;
bar.sync 0;
sub.s32 %r687, %r775, %r574;
add.s32 %r688, %r687, 4096;
add.s32 %r798, %r250, %r688;
mov.u32 %r801, 0;
.loc 1 227 20
add.s32 %r794, %r250, %r577;
.loc 1 223 22
add.s32 %r99, %r793, 1;
.loc 1 226 20
add.s32 %r690, %r264, %r688;
add.s32 %r691, %r687, 5120;
add.s32 %r692, %r264, %r691;
ld.shared.v2.u32 {%r791, %r790}, [%r690];
ld.shared.v2.u32 {%r789, %r788}, [%r692];
add.s32 %r693, %r266, %r688;
add.s32 %r694, %r266, %r691;
ld.shared.v2.u32 {%r787, %r786}, [%r693];
ld.shared.v2.u32 {%r785, %r784}, [%r694];
add.s32 %r695, %r687, 6144;
add.s32 %r696, %r264, %r695;
add.s32 %r697, %r687, 7168;
add.s32 %r698, %r264, %r697;
ld.shared.v2.u32 {%r783, %r782}, [%r696];
ld.shared.v2.u32 {%r781, %r780}, [%r698];
add.s32 %r699, %r266, %r695;
add.s32 %r700, %r266, %r697;
ld.shared.v2.u32 {%r779, %r778}, [%r699];
ld.shared.v2.u32 {%r777, %r776}, [%r700];
.loc 1 227 20
add.s32 %r701, %r775, %r576;
add.s32 %r542, %r44, %r701;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r538, %r539, %r540, %r541 }, [ %r542 + 0 ];
add.s32 %r702, %r775, %r575;
add.s32 %r547, %r44, %r702;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r543, %r544, %r545, %r546 }, [ %r547 + 0 ];
.loc 1 229 38
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r538;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r548, f0, f1, 0x7632;
prmt.b32 %r549, f2, f3, 0x7632;
}
{ .reg .b16 tmp; mov.b32 {tmp, %rs356}, %r549; }
cvt.u16.u32 %rs355, %r549;
{ .reg .b16 tmp; mov.b32 {tmp, %rs354}, %r548; }
cvt.u16.u32 %rs353, %r548;
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r539;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r551, f0, f1, 0x7632;
prmt.b32 %r552, f2, f3, 0x7632;
}
{ .reg .b16 tmp; mov.b32 {tmp, %rs348}, %r552; }
cvt.u16.u32 %rs347, %r552;
{ .reg .b16 tmp; mov.b32 {tmp, %rs346}, %r551; }
cvt.u16.u32 %rs345, %r551;
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r540;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r554, f0, f1, 0x7632;
prmt.b32 %r555, f2, f3, 0x7632;
}
{ .reg .b16 tmp; mov.b32 {tmp, %rs352}, %r555; }
cvt.u16.u32 %rs351, %r555;
{ .reg .b16 tmp; mov.b32 {tmp, %rs350}, %r554; }
cvt.u16.u32 %rs349, %r554;
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r541;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r557, f0, f1, 0x7632;
prmt.b32 %r558, f2, f3, 0x7632;
}
{ .reg .b16 tmp; mov.b32 {tmp, %rs344}, %r558; }
cvt.u16.u32 %rs343, %r558;
{ .reg .b16 tmp; mov.b32 {tmp, %rs342}, %r557; }
cvt.u16.u32 %rs341, %r557;
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r543;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r560, f0, f1, 0x7632;
prmt.b32 %r561, f2, f3, 0x7632;
}
{ .reg .b16 tmp; mov.b32 {tmp, %rs340}, %r561; }
cvt.u16.u32 %rs339, %r561;
{ .reg .b16 tmp; mov.b32 {tmp, %rs338}, %r560; }
cvt.u16.u32 %rs337, %r560;
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r544;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r563, f0, f1, 0x7632;
prmt.b32 %r564, f2, f3, 0x7632;
}
{ .reg .b16 tmp; mov.b32 {tmp, %rs332}, %r564; }
cvt.u16.u32 %rs331, %r564;
{ .reg .b16 tmp; mov.b32 {tmp, %rs330}, %r563; }
cvt.u16.u32 %rs329, %r563;
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r545;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r566, f0, f1, 0x7632;
prmt.b32 %r567, f2, f3, 0x7632;
}
{ .reg .b16 tmp; mov.b32 {tmp, %rs336}, %r567; }
cvt.u16.u32 %rs335, %r567;
{ .reg .b16 tmp; mov.b32 {tmp, %rs334}, %r566; }
cvt.u16.u32 %rs333, %r566;
{
.reg .s8 s<4>;
.reg .f32 f<4>;
mov.b32 {s0, s1, s2, s3}, %r546;
cvt.rn.f32.s8 f0, s0;
cvt.rn.f32.s8 f1, s1;
cvt.rn.f32.s8 f2, s2;
cvt.rn.f32.s8 f3, s3;
prmt.b32 %r569, f0, f1, 0x7632;
prmt.b32 %r570, f2, f3, 0x7632;
}
{ .reg .b16 tmp; mov.b32 {tmp, %rs328}, %r570; }
cvt.u16.u32 %rs327, %r570;
{ .reg .b16 tmp; mov.b32 {tmp, %rs326}, %r569; }
cvt.u16.u32 %rs325, %r569;
.loc 1 223 22
add.s32 %r775, %r775, 4096;
add.s32 %r774, %r774, 4096;
add.s32 %r773, %r773, -64;
add.s64 %rd88, %rd88, 64;
add.s64 %rd87, %rd87, 128;
add.s64 %rd86, %rd86, 128;
add.s64 %rd85, %rd85, 128;
add.s64 %rd84, %rd84, 128;
add.s32 %r703, %r793, -1;
setp.lt.s32 %p11, %r703, %r15;
mov.u32 %r793, %r99;
@%p11 bra $L__BB0_2;
$L__BB0_3:
.loc 1 208 51
and.b32 %r752, %r1, 63;
.loc 1 208 38
or.b32 %r753, %r14, %r752;
.loc 1 208 68
rem.s32 %r754, %r753, %r120;
.loc 1 212 22
mul.wide.s32 %rd74, %r754, 2;
add.s64 %rd69, %rd29, %rd74;
.loc 1 208 38
or.b32 %r755, %r14, %r7;
.loc 1 223 22
cp.async.wait_group 0x0;
bar.sync 0;
mov.pred %p12, -1;
.loc 1 234 16
mov.u16 %rs135, 0x0;
@%p12 ld.global.b16 { %rs135 }, [ %rd68 + 0 ];
.loc 1 235 16
mov.u16 %rs100, 0x0;
@%p12 ld.global.b16 { %rs100 }, [ %rd69 + 0 ];
.loc 1 236 24
mov.b32 %r704, %f417;
cvt.rn.bf16.f32 %rs134, %r704;
mov.b32 %r705, %f418;
cvt.rn.bf16.f32 %rs137, %r705;
mov.b32 %r706, %f419;
cvt.rn.bf16.f32 %rs140, %r706;
mov.b32 %r707, %f420;
cvt.rn.bf16.f32 %rs143, %r707;
mov.b32 %r708, %f421;
cvt.rn.bf16.f32 %rs146, %r708;
mov.b32 %r709, %f422;
cvt.rn.bf16.f32 %rs149, %r709;
mov.b32 %r710, %f423;
cvt.rn.bf16.f32 %rs152, %r710;
mov.b32 %r711, %f424;
cvt.rn.bf16.f32 %rs155, %r711;
mov.b32 %r712, %f425;
cvt.rn.bf16.f32 %rs158, %r712;
mov.b32 %r713, %f426;
cvt.rn.bf16.f32 %rs161, %r713;
mov.b32 %r714, %f427;
cvt.rn.bf16.f32 %rs164, %r714;
mov.b32 %r715, %f428;
cvt.rn.bf16.f32 %rs167, %r715;
mov.b32 %r716, %f429;
cvt.rn.bf16.f32 %rs170, %r716;
mov.b32 %r717, %f430;
cvt.rn.bf16.f32 %rs173, %r717;
mov.b32 %r718, %f431;
cvt.rn.bf16.f32 %rs176, %r718;
mov.b32 %r719, %f432;
cvt.rn.bf16.f32 %rs179, %r719;
mov.b32 %r720, %f433;
cvt.rn.bf16.f32 %rs182, %r720;
mov.b32 %r721, %f434;
cvt.rn.bf16.f32 %rs185, %r721;
mov.b32 %r722, %f435;
cvt.rn.bf16.f32 %rs188, %r722;
mov.b32 %r723, %f436;
cvt.rn.bf16.f32 %rs191, %r723;
mov.b32 %r724, %f437;
cvt.rn.bf16.f32 %rs194, %r724;
mov.b32 %r725, %f438;
cvt.rn.bf16.f32 %rs197, %r725;
mov.b32 %r726, %f439;
cvt.rn.bf16.f32 %rs200, %r726;
mov.b32 %r727, %f440;
cvt.rn.bf16.f32 %rs203, %r727;
mov.b32 %r728, %f441;
cvt.rn.bf16.f32 %rs206, %r728;
mov.b32 %r729, %f442;
cvt.rn.bf16.f32 %rs209, %r729;
mov.b32 %r730, %f443;
cvt.rn.bf16.f32 %rs212, %r730;
mov.b32 %r731, %f444;
cvt.rn.bf16.f32 %rs215, %r731;
mov.b32 %r732, %f445;
cvt.rn.bf16.f32 %rs218, %r732;
mov.b32 %r733, %f446;
cvt.rn.bf16.f32 %rs221, %r733;
mov.b32 %r734, %f447;
cvt.rn.bf16.f32 %rs224, %r734;
mov.b32 %r735, %f448;
cvt.rn.bf16.f32 %rs227, %r735;
.loc 1 236 39
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs230, %rs134, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs233, %rs137, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs236, %rs140, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs239, %rs143, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs242, %rs146, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs245, %rs149, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs248, %rs152, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs251, %rs155, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs254, %rs158, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs257, %rs161, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs260, %rs164, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs263, %rs167, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs266, %rs170, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs269, %rs173, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs272, %rs176, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs275, %rs179, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs278, %rs182, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs281, %rs185, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs284, %rs188, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs287, %rs191, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs290, %rs194, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs293, %rs197, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs296, %rs200, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs299, %rs203, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs302, %rs206, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs305, %rs209, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs308, %rs212, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs311, %rs215, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs314, %rs218, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs317, %rs221, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs320, %rs224, %rs135, c; }
{ .reg .b16 c;
mov.b16 c, 0x8000U;
fma.rn.bf16 %rs323, %rs227, %rs135, c; }
.loc 1 236 43
shl.b32 %r756, %r752, 1;
add.s32 %r758, %r250, %r756;
st.shared.u16 [%r758], %rs100;
bar.sync 0;
shl.b32 %r759, %r6, 1;
or.b32 %r761, %r18, %r759;
add.s32 %r762, %r250, %r761;
ld.shared.u16 %rs237, [%r762];
ld.shared.u16 %rs240, [%r762+2];
ld.shared.u16 %rs249, [%r762+32];
ld.shared.u16 %rs252, [%r762+34];
ld.shared.u16 %rs261, [%r762+64];
ld.shared.u16 %rs264, [%r762+66];
ld.shared.u16 %rs273, [%r762+96];
ld.shared.u16 %rs276, [%r762+98];
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs229, %rs230, c, %rs237; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs232, %rs233, c, %rs240; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs235, %rs236, c, %rs237; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs238, %rs239, c, %rs240; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs241, %rs242, c, %rs249; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs244, %rs245, c, %rs252; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs247, %rs248, c, %rs249; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs250, %rs251, c, %rs252; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs253, %rs254, c, %rs261; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs256, %rs257, c, %rs264; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs259, %rs260, c, %rs261; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs262, %rs263, c, %rs264; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs265, %rs266, c, %rs273; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs268, %rs269, c, %rs276; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs271, %rs272, c, %rs273; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs274, %rs275, c, %rs276; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs277, %rs278, c, %rs237; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs280, %rs281, c, %rs240; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs283, %rs284, c, %rs237; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs286, %rs287, c, %rs240; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs289, %rs290, c, %rs249; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs292, %rs293, c, %rs252; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs295, %rs296, c, %rs249; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs298, %rs299, c, %rs252; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs301, %rs302, c, %rs261; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs304, %rs305, c, %rs264; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs307, %rs308, c, %rs261; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs310, %rs311, c, %rs264; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs313, %rs314, c, %rs273; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs316, %rs317, c, %rs276; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs319, %rs320, c, %rs273; }
{ .reg .b16 c;
mov.b16 c, 0x3f80U;
fma.rn.bf16 %rs322, %rs323, c, %rs276; }
.loc 1 243 33
mul.lo.s32 %r763, %r10, %r122;
mul.lo.s32 %r764, %r11, %r122;
shl.b32 %r765, %r122, 4;
add.s32 %r766, %r763, %r765;
shl.b32 %r767, %r122, 3;
add.s32 %r768, %r766, %r767;
.loc 1 243 21
mul.wide.s32 %rd75, %r763, 2;
add.s64 %rd76, %rd31, %rd75;
mul.wide.s32 %rd77, %r764, 2;
add.s64 %rd78, %rd31, %rd77;
mul.wide.s32 %rd79, %r766, 2;
add.s64 %rd80, %rd31, %rd79;
mul.wide.s32 %rd81, %r768, 2;
add.s64 %rd82, %rd31, %rd81;
.loc 1 243 52
mul.wide.s32 %rd83, %r755, 2;
add.s64 %rd70, %rd76, %rd83;
add.s64 %rd71, %rd78, %rd83;
add.s64 %rd72, %rd80, %rd83;
add.s64 %rd73, %rd82, %rd83;
.loc 1 244 33
setp.lt.s32 %p18, %r10, %r119;
setp.lt.s32 %p19, %r11, %r119;
setp.lt.s32 %p20, %r12, %r119;
setp.lt.s32 %p21, %r13, %r119;
.loc 1 244 58
setp.lt.s32 %p22, %r755, %r120;
.loc 1 244 39
and.pred %p14, %p18, %p22;
and.pred %p15, %p19, %p22;
and.pred %p16, %p20, %p22;
and.pred %p17, %p21, %p22;
.loc 1 245 21
bar.sync 0;
mad.lo.s32 %r769, %r5, 144, %r762;
st.shared.v2.u16 [%r769], {%rs229, %rs232};
st.shared.v2.u16 [%r769+1152], {%rs235, %rs238};
st.shared.v2.u16 [%r769+32], {%rs241, %rs244};
st.shared.v2.u16 [%r769+1184], {%rs247, %rs250};
st.shared.v2.u16 [%r769+64], {%rs253, %rs256};
st.shared.v2.u16 [%r769+1216], {%rs259, %rs262};
st.shared.v2.u16 [%r769+96], {%rs265, %rs268};
st.shared.v2.u16 [%r769+1248], {%rs271, %rs274};
bar.sync 0;
mad.lo.s32 %r770, %r4, 72, %r7;
shl.b32 %r771, %r770, 1;
add.s32 %r772, %r250, %r771;
ld.shared.v4.u32 {%r736, %r737, %r738, %r739}, [%r772];
ld.shared.v4.u32 {%r740, %r741, %r742, %r743}, [%r772+1152];
bar.sync 0;
st.shared.v2.u16 [%r769], {%rs277, %rs280};
st.shared.v2.u16 [%r769+1152], {%rs283, %rs286};
st.shared.v2.u16 [%r769+32], {%rs289, %rs292};
st.shared.v2.u16 [%r769+1184], {%rs295, %rs298};
st.shared.v2.u16 [%r769+64], {%rs301, %rs304};
st.shared.v2.u16 [%r769+1216], {%rs307, %rs310};
st.shared.v2.u16 [%r769+96], {%rs313, %rs316};
st.shared.v2.u16 [%r769+1248], {%rs319, %rs322};
bar.sync 0;
ld.shared.v4.u32 {%r744, %r745, %r746, %r747}, [%r772];
ld.shared.v4.u32 {%r748, %r749, %r750, %r751}, [%r772+1152];
@%p14 st.global.v4.b32 [ %rd70 + 0 ], { %r736, %r737, %r738, %r739 };
@%p15 st.global.v4.b32 [ %rd71 + 0 ], { %r740, %r741, %r742, %r743 };
@%p16 st.global.v4.b32 [ %rd72 + 0 ], { %r744, %r745, %r746, %r747 };
@%p17 st.global.v4.b32 [ %rd73 + 0 ], { %r748, %r749, %r750, %r751 };
.loc 1 245 4
ret;
$L__tmp8:
$L__func_end0:
}
// Source-file table for the debug line information. The integer after .file
// is the file index that .loc directives name as their first operand
// (for example, `.loc 1 236 39` in the kernel body refers to file 1 below).
.file 1 "/home/cdhernandez/local/test.py"
.file 2 "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/triton/language/standard.py"
.file 3 "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/triton/language/core.py"
// DWARF abbreviation table, emitted as raw bytes.
// Layout of each abbreviation: code, tag, has-children flag, then a list of
// (attribute, form) pairs closed by a 0,0 pair. A final lone 0 closes the table.
// Symbolic names in the comments below are taken from the DWARF specification's
// encoding tables; multi-byte attribute numbers are ULEB128-encoded.
.section .debug_abbrev
{
// --- Abbreviation 1: tag 17 (0x11, DW_TAG_compile_unit), has children = 1 ---
.b8 1
.b8 17
.b8 1
// attribute 37 (0x25, DW_AT_producer), form 8 (DW_FORM_string)
.b8 37
.b8 8
// attribute 19 (0x13, DW_AT_language), form 5 (DW_FORM_data2)
.b8 19
.b8 5
// attribute 3 (DW_AT_name), form 8 (DW_FORM_string)
.b8 3
.b8 8
// attribute 16 (0x10, DW_AT_stmt_list), form 6 (DW_FORM_data4)
.b8 16
.b8 6
// attribute 27 (0x1b, DW_AT_comp_dir), form 8 (DW_FORM_string)
.b8 27
.b8 8
// attribute bytes 180,66 = ULEB128 0x2134 (a vendor-extension attribute;
// DW_AT_GNU_pubnames in the GNU extension numbering), form 12 (DW_FORM_flag)
.b8 180
.b8 66
.b8 12
// attribute 17 (0x11, DW_AT_low_pc), form 1 (DW_FORM_addr)
.b8 17
.b8 1
// attribute 18 (0x12, DW_AT_high_pc), form 1 (DW_FORM_addr)
.b8 18
.b8 1
// end of attribute list for abbreviation 1
.b8 0
.b8 0
// --- Abbreviation 2: tag 46 (0x2e, DW_TAG_subprogram), has children = 0 ---
.b8 2
.b8 46
.b8 0
// attribute bytes 135,64 = ULEB128 0x2007 (vendor extension;
// DW_AT_MIPS_linkage_name), form 8 (DW_FORM_string)
.b8 135
.b8 64
.b8 8
// attribute 3 (DW_AT_name), form 8 (DW_FORM_string)
.b8 3
.b8 8
// attribute 58 (0x3a, DW_AT_decl_file), form 11 (DW_FORM_data1)
.b8 58
.b8 11
// attribute 59 (0x3b, DW_AT_decl_line), form 11 (DW_FORM_data1)
.b8 59
.b8 11
// attribute 63 (0x3f, DW_AT_external), form 12 (DW_FORM_flag)
.b8 63
.b8 12
// attribute 32 (0x20, DW_AT_inline), form 11 (DW_FORM_data1)
.b8 32
.b8 11
// end of attribute list for abbreviation 2
.b8 0
.b8 0
// --- Abbreviation 3: tag 46 (0x2e, DW_TAG_subprogram), has children = 1 ---
.b8 3
.b8 46
.b8 1
// attribute 17 (DW_AT_low_pc), form 1 (DW_FORM_addr)
.b8 17
.b8 1
// attribute 18 (DW_AT_high_pc), form 1 (DW_FORM_addr)
.b8 18
.b8 1
// attribute 64 (0x40, DW_AT_frame_base), form 10 (DW_FORM_block1)
.b8 64
.b8 10
// attribute 49 (0x31, DW_AT_abstract_origin), form 19 (0x13, DW_FORM_ref4)
.b8 49
.b8 19
// end of attribute list for abbreviation 3
.b8 0
.b8 0
// --- Abbreviation 4: tag 29 (0x1d, DW_TAG_inlined_subroutine), has children = 0 ---
.b8 4
.b8 29
.b8 0
// attribute 49 (DW_AT_abstract_origin), form 19 (DW_FORM_ref4)
.b8 49
.b8 19
// attribute 17 (DW_AT_low_pc), form 1 (DW_FORM_addr)
.b8 17
.b8 1
// attribute 18 (DW_AT_high_pc), form 1 (DW_FORM_addr)
.b8 18
.b8 1
// attribute 88 (0x58, DW_AT_call_file), form 11 (DW_FORM_data1)
.b8 88
.b8 11
// attribute 89 (0x59, DW_AT_call_line), form 11 (DW_FORM_data1)
.b8 89
.b8 11
// attribute 87 (0x57, DW_AT_call_column), form 11 (DW_FORM_data1)
.b8 87
.b8 11
// end of attribute list for abbreviation 4
.b8 0
.b8 0
// abbreviation code 0: end of the abbreviation table
.b8 0
}
// DWARF .debug_info contribution for this module, emitted as raw bytes.
// It holds one compilation unit: an 11-byte header followed by debugging
// information entries (DIEs). Each DIE starts with an abbreviation code whose
// attribute layout is given by the .debug_abbrev section of this file.
// Symbolic DWARF names in the comments come from the DWARF specification;
// byte offsets quoted below are counted from the start of this section.
.section .debug_info
{
// Unit header: length of the unit excluding this 4-byte field (330 bytes).
.b32 330
// Unit header: 16-bit DWARF version, low byte first (value 2).
.b8 2
.b8 0
// Unit header: offset of this unit's abbreviations in .debug_abbrev.
.b32 .debug_abbrev
// Unit header: size of a target address in bytes (8).
.b8 8
// --- DIE at offset 11, abbreviation 1 (compile unit) ---
.b8 1
// DW_AT_producer: NUL-terminated ASCII string "triton"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
// DW_AT_language: 16-bit value 2 (DW_LANG_C in the DWARF language codes)
.b8 2
.b8 0
// DW_AT_name: NUL-terminated ASCII string "test.py"
.b8 116
.b8 101
.b8 115
.b8 116
.b8 46
.b8 112
.b8 121
.b8 0
// DW_AT_stmt_list: reference to this unit's line table in .debug_line
.b32 .debug_line
// DW_AT_comp_dir: NUL-terminated ASCII string "/home/cdhernandez/local"
.b8 47
.b8 104
.b8 111
.b8 109
.b8 101
.b8 47
.b8 99
.b8 100
.b8 104
.b8 101
.b8 114
.b8 110
.b8 97
.b8 110
.b8 100
.b8 101
.b8 122
.b8 47
.b8 108
.b8 111
.b8 99
.b8 97
.b8 108
.b8 0
// Flag for the vendor attribute 0x2134 declared in abbreviation 1 (set to 1)
.b8 1
// DW_AT_low_pc / DW_AT_high_pc: address range covered by the unit
.b64 $L__func_begin0
.b64 $L__func_end0
// --- DIE at offset 74, abbreviation 2 (subprogram, no children) ---
// This is the entry that the abstract-origin references (value 74) below
// and the .debug_pubnames entry point at.
.b8 2
// Linkage name (attribute 0x2007): NUL-terminated ASCII string
// "int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c"
.b8 105
.b8 110
.b8 116
.b8 56
.b8 95
.b8 119
.b8 101
.b8 105
.b8 103
.b8 104
.b8 116
.b8 95
.b8 111
.b8 110
.b8 108
.b8 121
.b8 95
.b8 108
.b8 105
.b8 110
.b8 101
.b8 97
.b8 114
.b8 95
.b8 107
.b8 101
.b8 114
.b8 110
.b8 101
.b8 108
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 55
.b8 100
.b8 56
.b8 100
.b8 57
.b8 99
.b8 49
.b8 48
.b8 99
.b8 49
.b8 49
.b8 100
.b8 49
.b8 50
.b8 99
.b8 49
.b8 51
.b8 100
.b8 49
.b8 52
.b8 99
.b8 0
// DW_AT_name: the same NUL-terminated kernel name as the linkage name above
.b8 105
.b8 110
.b8 116
.b8 56
.b8 95
.b8 119
.b8 101
.b8 105
.b8 103
.b8 104
.b8 116
.b8 95
.b8 111
.b8 110
.b8 108
.b8 121
.b8 95
.b8 108
.b8 105
.b8 110
.b8 101
.b8 97
.b8 114
.b8 95
.b8 107
.b8 101
.b8 114
.b8 110
.b8 101
.b8 108
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 55
.b8 100
.b8 56
.b8 100
.b8 57
.b8 99
.b8 49
.b8 48
.b8 99
.b8 49
.b8 49
.b8 100
.b8 49
.b8 50
.b8 99
.b8 49
.b8 51
.b8 100
.b8 49
.b8 52
.b8 99
.b8 0
// DW_AT_decl_file = 1
.b8 1
// DW_AT_decl_line = 167 (the kernel's first .loc directive is `.loc 1 167 0`)
.b8 167
// DW_AT_external = 1
.b8 1
// DW_AT_inline = 1
.b8 1
// --- DIE at offset 213, abbreviation 3 (subprogram with children) ---
.b8 3
// DW_AT_low_pc / DW_AT_high_pc: labels bracketing the kernel body
.b64 $L__func_begin0
.b64 $L__func_end0
// DW_AT_frame_base: 1-byte block containing opcode 156 (0x9c,
// DW_OP_call_frame_cfa in the DWARF operation encodings)
.b8 1
.b8 156
// DW_AT_abstract_origin: unit-relative reference to the DIE at offset 74
.b32 74
// --- Child DIE, abbreviation 4 (inlined subroutine) ---
// origin = 74, range [$L__tmp1, $L__tmp2), call file 2, line 191, column 27
.b8 4
.b32 74
.b64 $L__tmp1
.b64 $L__tmp2
.b8 2
.b8 191
.b8 27
// --- Child DIE, abbreviation 4 (inlined subroutine) ---
// origin = 74, range [$L__tmp2, $L__tmp3), call file 2, line 192, column 27
.b8 4
.b32 74
.b64 $L__tmp2
.b64 $L__tmp3
.b8 2
.b8 192
.b8 27
// --- Child DIE, abbreviation 4 (inlined subroutine) ---
// origin = 74, range [$L__tmp4, $L__tmp5), call file 3, line 196, column 48
.b8 4
.b32 74
.b64 $L__tmp4
.b64 $L__tmp5
.b8 3
.b8 196
.b8 48
// --- Child DIE, abbreviation 4 (inlined subroutine) ---
// origin = 74, range [$L__tmp6, $L__tmp7), call file 2, line 223, column 33
.b8 4
.b32 74
.b64 $L__tmp6
.b64 $L__tmp7
.b8 2
.b8 223
.b8 33
// Null entry: ends the children of the abbreviation-3 subprogram DIE
.b8 0
// Null entry: ends the children of the compile-unit DIE
.b8 0
}
// DWARF public-names lookup table, emitted as raw bytes.
// It contains a single (DIE offset, name) entry for the kernel, followed by a
// zero offset that terminates the list.
.section .debug_pubnames
{
// Table length, computed from the two labels that bracket the payload.
.b32 $L__pubNames_end0-$L__pubNames_start0
$L__pubNames_start0:
// 16-bit table version, low byte first (value 2).
.b8 2
.b8 0
// Offset of the associated unit in .debug_info.
.b32 .debug_info
// Size of that .debug_info unit in bytes (334 = the 330-byte unit length
// recorded in .debug_info plus its own 4-byte length field).
.b32 334
// Entry: unit-relative offset (74) of the DIE being named ...
.b32 74
// ... and its name as a NUL-terminated ASCII string:
// "int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c"
.b8 105
.b8 110
.b8 116
.b8 56
.b8 95
.b8 119
.b8 101
.b8 105
.b8 103
.b8 104
.b8 116
.b8 95
.b8 111
.b8 110
.b8 108
.b8 121
.b8 95
.b8 108
.b8 105
.b8 110
.b8 101
.b8 97
.b8 114
.b8 95
.b8 107
.b8 101
.b8 114
.b8 110
.b8 101
.b8 108
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 55
.b8 100
.b8 56
.b8 100
.b8 57
.b8 99
.b8 49
.b8 48
.b8 99
.b8 49
.b8 49
.b8 100
.b8 49
.b8 50
.b8 99
.b8 49
.b8 51
.b8 100
.b8 49
.b8 52
.b8 99
.b8 0
// Zero offset: end of the name list.
.b32 0
$L__pubNames_end0:
}
// DWARF public-types lookup table, emitted as raw bytes.
// Only the table header is present: the zero offset that terminates the entry
// list follows it immediately, so no type names are published.
.section .debug_pubtypes
{
// Table length, computed from the two labels that bracket the payload.
.b32 $L__pubTypes_end0-$L__pubTypes_start0
$L__pubTypes_start0:
// 16-bit table version, low byte first (value 2).
.b8 2
.b8 0
// Offset of the associated unit in .debug_info.
.b32 .debug_info
// Size of that .debug_info unit in bytes (334).
.b32 334
// Zero offset: end of the (empty) entry list.
.b32 0
$L__pubTypes_end0:
}
// DWARF location-list section; the body is empty, so no location lists are emitted.
.section .debug_loc { }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment