Created
July 27, 2023 19:34
-
-
Save HDCharles/00562275b4e360e2784a058c283d7f73 to your computer and use it in GitHub Desktop.
int8_weight_only_linear_kernel
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // PTX module header: the directives below select the PTX ISA version, the target SM architecture and the address width, then declare the shared-memory buffer used by the kernel. | |
| // Generated by LLVM NVPTX Back-End | |
| // global_smem is declared as an unsized, externally defined .shared array of bytes (.b8) with 1-byte alignment; its actual size is not specified anywhere in this module. | |
| .version 8.1 | |
| .target sm_80 | |
| .address_size 64 | |
| // .globl int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c | |
| .extern .shared .align 1 .b8 global_smem[]; | |
| .visible .entry int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c( | |
| .param .u64 int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_0, | |
| .param .u64 int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_1, | |
| .param .u64 int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_2, | |
| .param .u64 int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_3, | |
| .param .u64 int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_4, | |
| .param .u32 int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_5, | |
| .param .u32 int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_6, | |
| .param .u32 int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_7, | |
| .param .u32 int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_8, | |
| .param .u32 int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_9, | |
| .param .u32 int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_10 | |
| ) | |
| .maxntid 64, 1, 1 | |
| { | |
| .reg .pred %p<23>; | |
| .reg .b16 %rs<357>; | |
| .reg .b32 %r<802>; | |
| .reg .f32 %f<481>; | |
| .reg .b64 %rd<89>; | |
| .loc 1 167 0 | |
| $L__func_begin0: | |
| .loc 1 167 0 | |
| ld.param.u32 %r122, [int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_10]; | |
| ld.param.u32 %r121, [int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_7]; | |
| ld.param.u32 %r120, [int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_6]; | |
| ld.param.u32 %r119, [int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_5]; | |
| ld.param.u64 %rd31, [int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_4]; | |
| ld.param.u64 %rd68, [int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_3]; | |
| ld.param.u64 %rd29, [int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_2]; | |
| ld.param.u64 %rd88, [int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_1]; | |
| ld.param.u64 %rd27, [int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_0]; | |
| $L__tmp0: | |
| .loc 1 207 51 | |
| mov.u32 %r1, %tid.x; | |
| and.b32 %r189, %r1, 31; | |
| bfe.u32 %r2, %r1, 5, 1; | |
| bfe.u32 %r3, %r1, 3, 2; | |
| shl.b32 %r190, %r2, 2; | |
| or.b32 %r4, %r190, %r3; | |
| ld.param.u32 %r191, [int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_8]; | |
| .loc 1 208 51 | |
| bfe.u32 %r5, %r1, 2, 3; | |
| ld.param.u32 %r192, [int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c_param_9]; | |
| shl.b32 %r6, %r2, 3; | |
| or.b32 %r193, %r6, %r5; | |
| and.b32 %r194, %r1, 7; | |
| shl.b32 %r7, %r194, 3; | |
| .loc 1 211 29 | |
| and.b32 %r8, %r1, 3; | |
| shl.b32 %r9, %r8, 4; | |
| .loc 1 190 24 | |
| mov.u32 %r195, %ctaid.x; | |
| $L__tmp1: | |
| .loc 2 21 0 | |
| add.s32 %r196, %r119, 31; | |
| .loc 2 21 28 | |
| shr.s32 %r197, %r196, 31; | |
| shr.u32 %r198, %r197, 27; | |
| add.s32 %r199, %r196, %r198; | |
| shr.s32 %r200, %r199, 5; | |
| $L__tmp2: | |
| .loc 2 21 0 | |
| add.s32 %r201, %r120, 63; | |
| .loc 2 21 28 | |
| shr.s32 %r202, %r201, 31; | |
| shr.u32 %r203, %r202, 26; | |
| add.s32 %r204, %r201, %r203; | |
| shr.s32 %r205, %r204, 6; | |
| $L__tmp3: | |
| .loc 1 193 38 | |
| shl.b32 %r207, %r205, 3; | |
| .loc 1 194 22 | |
| div.s32 %r208, %r195, %r207; | |
| .loc 1 195 29 | |
| shl.b32 %r209, %r208, 3; | |
| .loc 1 196 35 | |
| sub.s32 %r210, %r200, %r209; | |
| $L__tmp4: | |
| .loc 3 1385 27 | |
| min.s32 %r211, %r210, 8; | |
| $L__tmp5: | |
| .loc 1 197 33 | |
| rem.s32 %r212, %r195, %r211; | |
| .loc 1 197 27 | |
| add.s32 %r213, %r209, %r212; | |
| mul.lo.s32 %r214, %r208, %r207; | |
| sub.s32 %r215, %r195, %r214; | |
| .loc 1 198 40 | |
| div.s32 %r216, %r215, %r211; | |
| .loc 1 207 23 | |
| shl.b32 %r217, %r213, 5; | |
| .loc 1 207 38 | |
| or.b32 %r10, %r217, %r4; | |
| or.b32 %r11, %r10, 8; | |
| or.b32 %r12, %r10, 16; | |
| or.b32 %r13, %r10, 24; | |
| .loc 1 207 68 | |
| rem.s32 %r218, %r10, %r119; | |
| rem.s32 %r219, %r11, %r119; | |
| rem.s32 %r220, %r12, %r119; | |
| rem.s32 %r221, %r13, %r119; | |
| .loc 1 208 23 | |
| shl.b32 %r14, %r216, 6; | |
| .loc 1 208 38 | |
| or.b32 %r222, %r14, %r193; | |
| or.b32 %r223, %r222, 16; | |
| or.b32 %r224, %r222, 32; | |
| or.b32 %r225, %r222, 48; | |
| .loc 1 208 68 | |
| rem.s32 %r226, %r222, %r120; | |
| rem.s32 %r227, %r223, %r120; | |
| rem.s32 %r228, %r224, %r120; | |
| rem.s32 %r229, %r225, %r120; | |
| .loc 1 210 53 | |
| mad.lo.s32 %r230, %r218, %r191, %r7; | |
| mad.lo.s32 %r231, %r219, %r191, %r7; | |
| mad.lo.s32 %r232, %r220, %r191, %r7; | |
| mad.lo.s32 %r233, %r221, %r191, %r7; | |
| .loc 1 210 22 | |
| mul.wide.s32 %rd48, %r230, 2; | |
| add.s64 %rd32, %rd27, %rd48; | |
| mul.wide.s32 %rd49, %r231, 2; | |
| add.s64 %rd33, %rd27, %rd49; | |
| mul.wide.s32 %rd50, %r232, 2; | |
| add.s64 %rd34, %rd27, %rd50; | |
| mul.wide.s32 %rd51, %r233, 2; | |
| add.s64 %rd35, %rd27, %rd51; | |
| .loc 1 211 52 | |
| mad.lo.s32 %r234, %r226, %r192, %r9; | |
| mad.lo.s32 %r235, %r227, %r192, %r9; | |
| mad.lo.s32 %r236, %r228, %r192, %r9; | |
| mad.lo.s32 %r237, %r229, %r192, %r9; | |
| .loc 1 211 22 | |
| cvt.s64.s32 %rd5, %r234; | |
| add.s64 %rd36, %rd88, %rd5; | |
| cvt.s64.s32 %rd6, %r235; | |
| add.s64 %rd37, %rd88, %rd6; | |
| cvt.s64.s32 %rd7, %r236; | |
| add.s64 %rd38, %rd88, %rd7; | |
| cvt.s64.s32 %rd8, %r237; | |
| add.s64 %rd39, %rd88, %rd8; | |
| $L__tmp6: | |
| .loc 2 21 0 | |
| add.s32 %r238, %r121, 63; | |
| $L__tmp7: | |
| .loc 1 223 22 | |
| setp.lt.s32 %p1, %r238, 64; | |
| setp.gt.s32 %p2, %r238, 63; | |
| .loc 1 226 51 | |
| setp.lt.s32 %p3, %r7, %r121; | |
| .loc 1 226 20 | |
| xor.b32 %r16, %r4, %r194; | |
| shl.b32 %r242, %r16, 3; | |
| and.b32 %r243, %r242, 48; | |
| shl.b32 %r244, %r1, 3; | |
| and.b32 %r245, %r244, 8; | |
| or.b32 %r246, %r245, %r243; | |
| shl.b32 %r247, %r4, 7; | |
| shl.b32 %r248, %r246, 1; | |
| or.b32 %r249, %r247, %r248; | |
| mov.u32 %r250, global_smem; | |
| add.s32 %r123, %r250, %r249; | |
| add.s32 %r125, %r123, 1024; | |
| add.s32 %r127, %r123, 2048; | |
| add.s32 %r129, %r123, 3072; | |
| selp.b32 %r251, 16, 0, %p2; | |
| selp.b32 %r126, %r251, 0, %p3; | |
| cp.async.cg.shared.global [ %r123 + 0 ], [ %rd32 + 0 ], 0x10, %r126; | |
| cp.async.cg.shared.global [ %r125 + 0 ], [ %rd33 + 0 ], 0x10, %r126; | |
| cp.async.cg.shared.global [ %r127 + 0 ], [ %rd34 + 0 ], 0x10, %r126; | |
| cp.async.cg.shared.global [ %r129 + 0 ], [ %rd35 + 0 ], 0x10, %r126; | |
| cp.async.commit_group ; | |
| .loc 1 227 51 | |
| setp.lt.s32 %p4, %r9, %r121; | |
| .loc 1 227 20 | |
| shr.u32 %r252, %r1, 3; | |
| shl.b32 %r253, %r193, 6; | |
| xor.b32 %r254, %r252, %r1; | |
| shl.b32 %r255, %r254, 4; | |
| and.b32 %r17, %r255, 48; | |
| or.b32 %r256, %r253, %r17; | |
| add.s32 %r794, %r250, 12288; | |
| add.s32 %r131, %r794, %r256; | |
| add.s32 %r133, %r131, 1024; | |
| add.s32 %r135, %r131, 2048; | |
| add.s32 %r137, %r131, 3072; | |
| selp.b32 %r134, %r251, 0, %p4; | |
| cp.async.cg.shared.global [ %r131 + 0 ], [ %rd36 + 0 ], 0x10, %r134; | |
| cp.async.cg.shared.global [ %r133 + 0 ], [ %rd37 + 0 ], 0x10, %r134; | |
| cp.async.cg.shared.global [ %r135 + 0 ], [ %rd38 + 0 ], 0x10, %r134; | |
| cp.async.cg.shared.global [ %r137 + 0 ], [ %rd39 + 0 ], 0x10, %r134; | |
| cp.async.commit_group ; | |
| .loc 1 231 18 | |
| add.s64 %rd40, %rd32, 128; | |
| add.s64 %rd41, %rd33, 128; | |
| add.s64 %rd42, %rd34, 128; | |
| add.s64 %rd43, %rd35, 128; | |
| .loc 1 232 18 | |
| add.s64 %rd44, %rd36, 64; | |
| add.s64 %rd45, %rd37, 64; | |
| add.s64 %rd46, %rd38, 64; | |
| add.s64 %rd47, %rd39, 64; | |
| .loc 1 223 22 | |
| setp.gt.s32 %p5, %r238, 127; | |
| .loc 1 226 55 | |
| add.s32 %r258, %r121, -64; | |
| .loc 1 226 51 | |
| setp.lt.s32 %p6, %r7, %r258; | |
| .loc 1 226 20 | |
| bar.sync 0; | |
| add.s32 %r139, %r123, 4096; | |
| add.s32 %r141, %r123, 5120; | |
| add.s32 %r143, %r123, 6144; | |
| add.s32 %r145, %r123, 7168; | |
| selp.b32 %r259, 16, 0, %p5; | |
| selp.b32 %r142, %r259, 0, %p6; | |
| cp.async.cg.shared.global [ %r139 + 0 ], [ %rd40 + 0 ], 0x10, %r142; | |
| cp.async.cg.shared.global [ %r141 + 0 ], [ %rd41 + 0 ], 0x10, %r142; | |
| cp.async.cg.shared.global [ %r143 + 0 ], [ %rd42 + 0 ], 0x10, %r142; | |
| cp.async.cg.shared.global [ %r145 + 0 ], [ %rd43 + 0 ], 0x10, %r142; | |
| cp.async.commit_group ; | |
| .loc 1 227 51 | |
| setp.lt.s32 %p7, %r9, %r258; | |
| .loc 1 227 20 | |
| add.s32 %r260, %r250, %r256; | |
| add.s32 %r147, %r260, 16384; | |
| add.s32 %r149, %r260, 17408; | |
| add.s32 %r151, %r260, 18432; | |
| add.s32 %r153, %r260, 19456; | |
| selp.b32 %r150, %r259, 0, %p7; | |
| cp.async.cg.shared.global [ %r147 + 0 ], [ %rd44 + 0 ], 0x10, %r150; | |
| cp.async.cg.shared.global [ %r149 + 0 ], [ %rd45 + 0 ], 0x10, %r150; | |
| cp.async.cg.shared.global [ %r151 + 0 ], [ %rd46 + 0 ], 0x10, %r150; | |
| cp.async.cg.shared.global [ %r153 + 0 ], [ %rd47 + 0 ], 0x10, %r150; | |
| cp.async.commit_group ; | |
| .loc 1 226 20 | |
| cp.async.wait_group 0x2; | |
| bar.sync 0; | |
| shl.b32 %r18, %r8, 2; | |
| .loc 1 227 20 | |
| bfe.u32 %r38, %r189, 3, 1; | |
| bfe.u32 %r39, %r1, 1, 2; | |
| and.b32 %r40, %r1, 23; | |
| or.b32 %r41, %r40, %r6; | |
| xor.b32 %r267, %r38, %r39; | |
| shl.b32 %r268, %r41, 6; | |
| shl.b32 %r42, %r267, 4; | |
| or.b32 %r269, %r268, %r42; | |
| add.s32 %r159, %r794, %r269; | |
| ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r155, %r156, %r157, %r158 }, [ %r159 + 0 ]; | |
| add.s32 %r164, %r159, 2048; | |
| ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r160, %r161, %r162, %r163 }, [ %r164 + 0 ]; | |
| mov.f32 %f417, 0f00000000; | |
| mov.f32 %f418, %f417; | |
| mov.f32 %f419, %f417; | |
| mov.f32 %f420, %f417; | |
| mov.f32 %f421, %f417; | |
| mov.f32 %f422, %f417; | |
| mov.f32 %f423, %f417; | |
| mov.f32 %f424, %f417; | |
| mov.f32 %f425, %f417; | |
| mov.f32 %f426, %f417; | |
| mov.f32 %f427, %f417; | |
| mov.f32 %f428, %f417; | |
| mov.f32 %f429, %f417; | |
| mov.f32 %f430, %f417; | |
| mov.f32 %f431, %f417; | |
| mov.f32 %f432, %f417; | |
| mov.f32 %f433, %f417; | |
| mov.f32 %f434, %f417; | |
| mov.f32 %f435, %f417; | |
| mov.f32 %f436, %f417; | |
| mov.f32 %f437, %f417; | |
| mov.f32 %f438, %f417; | |
| mov.f32 %f439, %f417; | |
| mov.f32 %f440, %f417; | |
| mov.f32 %f441, %f417; | |
| mov.f32 %f442, %f417; | |
| mov.f32 %f443, %f417; | |
| mov.f32 %f444, %f417; | |
| mov.f32 %f445, %f417; | |
| mov.f32 %f446, %f417; | |
| mov.f32 %f447, %f417; | |
| mov.f32 %f448, %f417; | |
| .loc 1 223 22 | |
| @%p1 bra $L__BB0_3; | |
| .loc 1 0 0 | |
| cvt.s64.s32 %rd1, %r230; | |
| cvt.s64.s32 %rd2, %r231; | |
| cvt.s64.s32 %rd3, %r232; | |
| cvt.s64.s32 %rd4, %r233; | |
| shr.s32 %r239, %r238, 31; | |
| shr.u32 %r240, %r239, 26; | |
| add.s32 %r241, %r238, %r240; | |
| shr.s32 %r15, %r241, 6; | |
| shl.b32 %r261, %r3, 4; | |
| shl.b32 %r19, %r5, 6; | |
| or.b32 %r262, %r261, %r19; | |
| or.b32 %r20, %r262, %r18; | |
| shl.b32 %r263, %r20, 1; | |
| add.s32 %r264, %r250, %r263; | |
| ld.shared.v2.u32 {%r791, %r790}, [%r264]; | |
| ld.shared.v2.u32 {%r789, %r788}, [%r264+1024]; | |
| xor.b32 %r25, %r20, 16; | |
| shl.b32 %r265, %r25, 1; | |
| add.s32 %r266, %r250, %r265; | |
| ld.shared.v2.u32 {%r787, %r786}, [%r266]; | |
| ld.shared.v2.u32 {%r785, %r784}, [%r266+1024]; | |
| ld.shared.v2.u32 {%r783, %r782}, [%r264+2048]; | |
| ld.shared.v2.u32 {%r781, %r780}, [%r264+3072]; | |
| ld.shared.v2.u32 {%r779, %r778}, [%r266+2048]; | |
| ld.shared.v2.u32 {%r777, %r776}, [%r266+3072]; | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r155; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r165, f0, f1, 0x7632; | |
| prmt.b32 %r166, f2, f3, 0x7632; | |
| } | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs356}, %r166; } | |
| cvt.u16.u32 %rs355, %r166; | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs354}, %r165; } | |
| cvt.u16.u32 %rs353, %r165; | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r156; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r168, f0, f1, 0x7632; | |
| prmt.b32 %r169, f2, f3, 0x7632; | |
| } | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs348}, %r169; } | |
| cvt.u16.u32 %rs347, %r169; | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs346}, %r168; } | |
| cvt.u16.u32 %rs345, %r168; | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r157; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r171, f0, f1, 0x7632; | |
| prmt.b32 %r172, f2, f3, 0x7632; | |
| } | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs352}, %r172; } | |
| cvt.u16.u32 %rs351, %r172; | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs350}, %r171; } | |
| cvt.u16.u32 %rs349, %r171; | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r158; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r174, f0, f1, 0x7632; | |
| prmt.b32 %r175, f2, f3, 0x7632; | |
| } | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs344}, %r175; } | |
| cvt.u16.u32 %rs343, %r175; | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs342}, %r174; } | |
| cvt.u16.u32 %rs341, %r174; | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r160; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r177, f0, f1, 0x7632; | |
| prmt.b32 %r178, f2, f3, 0x7632; | |
| } | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs340}, %r178; } | |
| cvt.u16.u32 %rs339, %r178; | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs338}, %r177; } | |
| cvt.u16.u32 %rs337, %r177; | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r161; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r180, f0, f1, 0x7632; | |
| prmt.b32 %r181, f2, f3, 0x7632; | |
| } | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs332}, %r181; } | |
| cvt.u16.u32 %rs331, %r181; | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs330}, %r180; } | |
| cvt.u16.u32 %rs329, %r180; | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r162; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r183, f0, f1, 0x7632; | |
| prmt.b32 %r184, f2, f3, 0x7632; | |
| } | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs336}, %r184; } | |
| cvt.u16.u32 %rs335, %r184; | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs334}, %r183; } | |
| cvt.u16.u32 %rs333, %r183; | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r163; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r186, f0, f1, 0x7632; | |
| prmt.b32 %r187, f2, f3, 0x7632; | |
| } | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs328}, %r187; } | |
| cvt.u16.u32 %rs327, %r187; | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs326}, %r186; } | |
| cvt.u16.u32 %rs325, %r186; | |
| .loc 1 223 22 | |
| shl.b32 %r284, %r2, 9; | |
| shl.b32 %r285, %r40, 6; | |
| .loc 1 229 38 | |
| or.b32 %r286, %r285, %r42; | |
| or.b32 %r43, %r286, 18432; | |
| .loc 1 223 22 | |
| add.s32 %r44, %r250, %r284; | |
| .loc 1 229 38 | |
| or.b32 %r45, %r286, 16384; | |
| .loc 1 223 22 | |
| add.s32 %r773, %r121, -128; | |
| .loc 1 229 38 | |
| add.s32 %r289, %r19, %r17; | |
| or.b32 %r49, %r289, 20480; | |
| or.b32 %r50, %r289, 23552; | |
| or.b32 %r51, %r289, 22528; | |
| or.b32 %r52, %r289, 21504; | |
| .loc 1 223 22 | |
| shl.b32 %r290, %r3, 7; | |
| shl.b32 %r291, %r16, 4; | |
| and.b32 %r292, %r291, 96; | |
| .loc 1 229 38 | |
| or.b32 %r293, %r290, %r292; | |
| .loc 1 223 22 | |
| cvt.u16.u32 %rs97, %r1; | |
| and.b16 %rs98, %rs97, 1; | |
| mul.wide.u16 %r294, %rs98, 16; | |
| .loc 1 229 38 | |
| or.b32 %r295, %r293, %r294; | |
| or.b32 %r53, %r295, 8192; | |
| or.b32 %r54, %r295, 11264; | |
| or.b32 %r55, %r295, 10240; | |
| or.b32 %r56, %r295, 9216; | |
| .loc 1 223 22 | |
| add.s64 %rd9, %rd8, 128; | |
| add.s64 %rd10, %rd7, 128; | |
| add.s64 %rd11, %rd6, 128; | |
| add.s64 %rd12, %rd5, 128; | |
| shl.b64 %rd52, %rd4, 1; | |
| add.s64 %rd53, %rd52, %rd27; | |
| add.s64 %rd87, %rd53, 256; | |
| shl.b64 %rd54, %rd3, 1; | |
| add.s64 %rd55, %rd54, %rd27; | |
| add.s64 %rd86, %rd55, 256; | |
| shl.b64 %rd56, %rd2, 1; | |
| add.s64 %rd57, %rd56, %rd27; | |
| add.s64 %rd85, %rd57, 256; | |
| shl.b64 %rd58, %rd1, 1; | |
| add.s64 %rd59, %rd58, %rd27; | |
| add.s64 %rd84, %rd59, 256; | |
| mov.f32 %f417, 0f00000000; | |
| mov.u32 %r775, 0; | |
| mov.u32 %r800, 1; | |
| mov.u32 %r799, 64; | |
| mov.u32 %r793, 2; | |
| mov.u32 %r774, 16384; | |
| mov.u32 %r792, %r800; | |
| mov.u32 %r801, %r775; | |
| mov.u32 %r798, %r250; | |
| mov.f32 %f418, %f417; | |
| mov.f32 %f419, %f417; | |
| mov.f32 %f420, %f417; | |
| mov.f32 %f421, %f417; | |
| mov.f32 %f422, %f417; | |
| mov.f32 %f423, %f417; | |
| mov.f32 %f424, %f417; | |
| mov.f32 %f425, %f417; | |
| mov.f32 %f426, %f417; | |
| mov.f32 %f427, %f417; | |
| mov.f32 %f428, %f417; | |
| mov.f32 %f429, %f417; | |
| mov.f32 %f430, %f417; | |
| mov.f32 %f431, %f417; | |
| mov.f32 %f432, %f417; | |
| mov.f32 %f433, %f417; | |
| mov.f32 %f434, %f417; | |
| mov.f32 %f435, %f417; | |
| mov.f32 %f436, %f417; | |
| mov.f32 %f437, %f417; | |
| mov.f32 %f438, %f417; | |
| mov.f32 %f439, %f417; | |
| mov.f32 %f440, %f417; | |
| mov.f32 %f441, %f417; | |
| mov.f32 %f442, %f417; | |
| mov.f32 %f443, %f417; | |
| mov.f32 %f444, %f417; | |
| mov.f32 %f445, %f417; | |
| mov.f32 %f446, %f417; | |
| mov.f32 %f447, %f417; | |
| mov.f32 %f448, %f417; | |
| $L__BB0_2: | |
| .loc 1 229 38 | |
| mul.hi.u32 %r572, %r792, -1431655765; | |
| shr.u32 %r573, %r572, 1; | |
| mul.lo.s32 %r574, %r573, 12288; | |
| sub.s32 %r575, %r43, %r574; | |
| sub.s32 %r576, %r45, %r574; | |
| sub.s32 %r577, %r774, %r574; | |
| mul.hi.u32 %r578, %r793, -1431655765; | |
| shr.u32 %r579, %r578, 1; | |
| mul.lo.s32 %r580, %r579, 12288; | |
| sub.s32 %r581, %r49, %r580; | |
| sub.s32 %r582, %r50, %r580; | |
| sub.s32 %r583, %r51, %r580; | |
| sub.s32 %r584, %r52, %r580; | |
| sub.s32 %r585, %r53, %r580; | |
| sub.s32 %r586, %r54, %r580; | |
| sub.s32 %r587, %r55, %r580; | |
| sub.s32 %r588, %r56, %r580; | |
| cvt.u32.u16 %r589, %rs325; | |
| cvt.u32.u16 %r590, %rs326; | |
| shl.b32 %r591, %r590, 16; | |
| or.b32 %r424, %r589, %r591; | |
| cvt.u32.u16 %r592, %rs327; | |
| cvt.u32.u16 %r593, %rs328; | |
| shl.b32 %r594, %r593, 16; | |
| or.b32 %r425, %r592, %r594; | |
| cvt.u32.u16 %r595, %rs329; | |
| cvt.u32.u16 %r596, %rs330; | |
| shl.b32 %r597, %r596, 16; | |
| or.b32 %r418, %r595, %r597; | |
| cvt.u32.u16 %r598, %rs331; | |
| cvt.u32.u16 %r599, %rs332; | |
| shl.b32 %r600, %r599, 16; | |
| or.b32 %r419, %r598, %r600; | |
| cvt.u32.u16 %r601, %rs333; | |
| cvt.u32.u16 %r602, %rs334; | |
| shl.b32 %r603, %r602, 16; | |
| or.b32 %r376, %r601, %r603; | |
| cvt.u32.u16 %r604, %rs335; | |
| cvt.u32.u16 %r605, %rs336; | |
| shl.b32 %r606, %r605, 16; | |
| or.b32 %r377, %r604, %r606; | |
| cvt.u32.u16 %r607, %rs337; | |
| cvt.u32.u16 %r608, %rs338; | |
| shl.b32 %r609, %r608, 16; | |
| or.b32 %r370, %r607, %r609; | |
| cvt.u32.u16 %r610, %rs339; | |
| cvt.u32.u16 %r611, %rs340; | |
| shl.b32 %r612, %r611, 16; | |
| or.b32 %r371, %r610, %r612; | |
| cvt.u32.u16 %r613, %rs341; | |
| cvt.u32.u16 %r614, %rs342; | |
| shl.b32 %r615, %r614, 16; | |
| or.b32 %r412, %r613, %r615; | |
| cvt.u32.u16 %r616, %rs343; | |
| cvt.u32.u16 %r617, %rs344; | |
| shl.b32 %r618, %r617, 16; | |
| or.b32 %r413, %r616, %r618; | |
| cvt.u32.u16 %r619, %rs345; | |
| cvt.u32.u16 %r620, %rs346; | |
| shl.b32 %r621, %r620, 16; | |
| or.b32 %r406, %r619, %r621; | |
| cvt.u32.u16 %r622, %rs347; | |
| cvt.u32.u16 %r623, %rs348; | |
| shl.b32 %r624, %r623, 16; | |
| or.b32 %r407, %r622, %r624; | |
| cvt.u32.u16 %r625, %rs349; | |
| cvt.u32.u16 %r626, %rs350; | |
| shl.b32 %r627, %r626, 16; | |
| or.b32 %r364, %r625, %r627; | |
| cvt.u32.u16 %r628, %rs351; | |
| cvt.u32.u16 %r629, %rs352; | |
| shl.b32 %r630, %r629, 16; | |
| or.b32 %r365, %r628, %r630; | |
| cvt.u32.u16 %r631, %rs353; | |
| cvt.u32.u16 %r632, %rs354; | |
| shl.b32 %r633, %r632, 16; | |
| or.b32 %r358, %r631, %r633; | |
| cvt.u32.u16 %r634, %rs355; | |
| cvt.u32.u16 %r635, %rs356; | |
| shl.b32 %r636, %r635, 16; | |
| or.b32 %r359, %r634, %r636; | |
| .loc 1 226 20 | |
| add.s32 %r637, %r801, 32; | |
| shl.b32 %r638, %r800, 6; | |
| add.s32 %r639, %r798, %r638; | |
| shr.u32 %r640, %r637, 4; | |
| xor.b32 %r641, %r640, %r3; | |
| shl.b32 %r642, %r641, 4; | |
| mad.lo.s32 %r643, %r799, %r5, %r18; | |
| add.s32 %r644, %r643, %r642; | |
| mov.u32 %r645, -32; | |
| sub.s32 %r646, %r645, %r801; | |
| shl.b32 %r647, %r646, 1; | |
| add.s32 %r648, %r639, %r647; | |
| shl.b32 %r649, %r644, 1; | |
| add.s32 %r650, %r648, %r649; | |
| shl.b32 %r651, %r799, 4; | |
| add.s32 %r652, %r650, %r651; | |
| ld.shared.v2.u32 {%r426, %r428}, [%r650]; | |
| ld.shared.v2.u32 {%r427, %r429}, [%r652]; | |
| add.s32 %r653, %r640, 1; | |
| xor.b32 %r654, %r653, %r3; | |
| shl.b32 %r655, %r654, 4; | |
| add.s32 %r656, %r643, %r655; | |
| shl.b32 %r657, %r656, 1; | |
| add.s32 %r658, %r648, %r657; | |
| add.s32 %r659, %r658, %r651; | |
| ld.shared.v2.u32 {%r474, %r476}, [%r658]; | |
| ld.shared.v2.u32 {%r475, %r477}, [%r659]; | |
| shl.b32 %r660, %r799, 5; | |
| add.s32 %r661, %r650, %r660; | |
| mul.lo.s32 %r662, %r799, 48; | |
| add.s32 %r663, %r650, %r662; | |
| ld.shared.v2.u32 {%r450, %r452}, [%r661]; | |
| ld.shared.v2.u32 {%r451, %r453}, [%r663]; | |
| add.s32 %r664, %r658, %r660; | |
| add.s32 %r665, %r658, %r662; | |
| ld.shared.v2.u32 {%r498, %r500}, [%r664]; | |
| ld.shared.v2.u32 {%r499, %r501}, [%r665]; | |
| .loc 1 227 20 | |
| add.s32 %r666, %r801, 32; | |
| shl.b32 %r667, %r800, 5; | |
| add.s32 %r668, %r794, %r667; | |
| shr.u32 %r669, %r666, 4; | |
| add.s32 %r670, %r669, %r38; | |
| xor.b32 %r671, %r670, %r39; | |
| shl.b32 %r672, %r671, 4; | |
| mad.lo.s32 %r673, %r799, %r41, %r672; | |
| sub.s32 %r674, %r668, %r801; | |
| add.s32 %r675, %r674, %r673; | |
| add.s32 %r300, %r675, -32; | |
| ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r296, %r297, %r298, %r299 }, [ %r300 + 0 ]; | |
| shl.b32 %r676, %r799, 5; | |
| add.s32 %r305, %r300, %r676; | |
| ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r301, %r302, %r303, %r304 }, [ %r305 + 0 ]; | |
| .loc 1 229 38 | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r296; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r430, f0, f1, 0x7632; | |
| prmt.b32 %r431, f2, f3, 0x7632; | |
| } | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r297; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r478, f0, f1, 0x7632; | |
| prmt.b32 %r479, f2, f3, 0x7632; | |
| } | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r298; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r436, f0, f1, 0x7632; | |
| prmt.b32 %r437, f2, f3, 0x7632; | |
| } | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r299; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r484, f0, f1, 0x7632; | |
| prmt.b32 %r485, f2, f3, 0x7632; | |
| } | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r301; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r442, f0, f1, 0x7632; | |
| prmt.b32 %r443, f2, f3, 0x7632; | |
| } | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r302; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r490, f0, f1, 0x7632; | |
| prmt.b32 %r491, f2, f3, 0x7632; | |
| } | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r303; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r448, f0, f1, 0x7632; | |
| prmt.b32 %r449, f2, f3, 0x7632; | |
| } | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r304; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r496, f0, f1, 0x7632; | |
| prmt.b32 %r497, f2, f3, 0x7632; | |
| } | |
| .loc 1 229 0 | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f417, %f418, %f419, %f420 }, { %r791, %r789, %r790, %r788 }, { %r358, %r359 }, { %f417, %f418, %f419, %f420 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f421, %f422, %f423, %f424 }, { %r791, %r789, %r790, %r788 }, { %r364, %r365 }, { %f421, %f422, %f423, %f424 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f425, %f426, %f427, %f428 }, { %r791, %r789, %r790, %r788 }, { %r370, %r371 }, { %f425, %f426, %f427, %f428 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f429, %f430, %f431, %f432 }, { %r791, %r789, %r790, %r788 }, { %r376, %r377 }, { %f429, %f430, %f431, %f432 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f433, %f434, %f435, %f436 }, { %r783, %r781, %r782, %r780 }, { %r358, %r359 }, { %f433, %f434, %f435, %f436 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f437, %f438, %f439, %f440 }, { %r783, %r781, %r782, %r780 }, { %r364, %r365 }, { %f437, %f438, %f439, %f440 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f441, %f442, %f443, %f444 }, { %r783, %r781, %r782, %r780 }, { %r370, %r371 }, { %f441, %f442, %f443, %f444 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f445, %f446, %f447, %f448 }, { %r783, %r781, %r782, %r780 }, { %r376, %r377 }, { %f445, %f446, %f447, %f448 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f417, %f418, %f419, %f420 }, { %r787, %r785, %r786, %r784 }, { %r406, %r407 }, { %f417, %f418, %f419, %f420 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f421, %f422, %f423, %f424 }, { %r787, %r785, %r786, %r784 }, { %r412, %r413 }, { %f421, %f422, %f423, %f424 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f425, %f426, %f427, %f428 }, { %r787, %r785, %r786, %r784 }, { %r418, %r419 }, { %f425, %f426, %f427, %f428 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f429, %f430, %f431, %f432 }, { %r787, %r785, %r786, %r784 }, { %r424, %r425 }, { %f429, %f430, %f431, %f432 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f433, %f434, %f435, %f436 }, { %r779, %r777, %r778, %r776 }, { %r406, %r407 }, { %f433, %f434, %f435, %f436 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f437, %f438, %f439, %f440 }, { %r779, %r777, %r778, %r776 }, { %r412, %r413 }, { %f437, %f438, %f439, %f440 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f441, %f442, %f443, %f444 }, { %r779, %r777, %r778, %r776 }, { %r418, %r419 }, { %f441, %f442, %f443, %f444 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f445, %f446, %f447, %f448 }, { %r779, %r777, %r778, %r776 }, { %r424, %r425 }, { %f445, %f446, %f447, %f448 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f417, %f418, %f419, %f420 }, { %r426, %r427, %r428, %r429 }, { %r430, %r431 }, { %f417, %f418, %f419, %f420 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f421, %f422, %f423, %f424 }, { %r426, %r427, %r428, %r429 }, { %r436, %r437 }, { %f421, %f422, %f423, %f424 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f425, %f426, %f427, %f428 }, { %r426, %r427, %r428, %r429 }, { %r442, %r443 }, { %f425, %f426, %f427, %f428 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f429, %f430, %f431, %f432 }, { %r426, %r427, %r428, %r429 }, { %r448, %r449 }, { %f429, %f430, %f431, %f432 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f433, %f434, %f435, %f436 }, { %r450, %r451, %r452, %r453 }, { %r430, %r431 }, { %f433, %f434, %f435, %f436 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f437, %f438, %f439, %f440 }, { %r450, %r451, %r452, %r453 }, { %r436, %r437 }, { %f437, %f438, %f439, %f440 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f441, %f442, %f443, %f444 }, { %r450, %r451, %r452, %r453 }, { %r442, %r443 }, { %f441, %f442, %f443, %f444 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f445, %f446, %f447, %f448 }, { %r450, %r451, %r452, %r453 }, { %r448, %r449 }, { %f445, %f446, %f447, %f448 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f417, %f418, %f419, %f420 }, { %r474, %r475, %r476, %r477 }, { %r478, %r479 }, { %f417, %f418, %f419, %f420 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f421, %f422, %f423, %f424 }, { %r474, %r475, %r476, %r477 }, { %r484, %r485 }, { %f421, %f422, %f423, %f424 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f425, %f426, %f427, %f428 }, { %r474, %r475, %r476, %r477 }, { %r490, %r491 }, { %f425, %f426, %f427, %f428 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f429, %f430, %f431, %f432 }, { %r474, %r475, %r476, %r477 }, { %r496, %r497 }, { %f429, %f430, %f431, %f432 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f433, %f434, %f435, %f436 }, { %r498, %r499, %r500, %r501 }, { %r478, %r479 }, { %f433, %f434, %f435, %f436 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f437, %f438, %f439, %f440 }, { %r498, %r499, %r500, %r501 }, { %r484, %r485 }, { %f437, %f438, %f439, %f440 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f441, %f442, %f443, %f444 }, { %r498, %r499, %r500, %r501 }, { %r490, %r491 }, { %f441, %f442, %f443, %f444 }; | |
| mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 { %f445, %f446, %f447, %f448 }, { %r498, %r499, %r500, %r501 }, { %r496, %r497 }, { %f445, %f446, %f447, %f448 }; | |
| .loc 1 223 22 | |
| add.s32 %r792, %r792, 1; | |
| setp.lt.s32 %p8, %r793, %r15; | |
| .loc 1 226 51 | |
| setp.lt.s32 %p9, %r7, %r773; | |
| .loc 1 227 51 | |
| setp.lt.s32 %p10, %r9, %r773; | |
| .loc 1 232 18 | |
| add.s64 %rd64, %rd88, %rd12; | |
| add.s64 %rd65, %rd88, %rd11; | |
| add.s64 %rd66, %rd88, %rd10; | |
| .loc 1 226 20 | |
| add.s64 %rd67, %rd88, %rd9; | |
| bar.sync 0; | |
| add.s32 %r677, %r775, %r585; | |
| add.s32 %r522, %r44, %r677; | |
| add.s32 %r678, %r775, %r588; | |
| add.s32 %r524, %r44, %r678; | |
| add.s32 %r679, %r775, %r587; | |
| add.s32 %r526, %r44, %r679; | |
| add.s32 %r680, %r775, %r586; | |
| add.s32 %r528, %r44, %r680; | |
| selp.b32 %r681, 16, 0, %p9; | |
| selp.b32 %r525, %r681, 0, %p8; | |
| cp.async.cg.shared.global [ %r522 + 0 ], [ %rd84 + 0 ], 0x10, %r525; | |
| cp.async.cg.shared.global [ %r524 + 0 ], [ %rd85 + 0 ], 0x10, %r525; | |
| cp.async.cg.shared.global [ %r526 + 0 ], [ %rd86 + 0 ], 0x10, %r525; | |
| cp.async.cg.shared.global [ %r528 + 0 ], [ %rd87 + 0 ], 0x10, %r525; | |
| cp.async.commit_group ; | |
| .loc 1 227 20 | |
| add.s32 %r682, %r775, %r581; | |
| add.s32 %r530, %r44, %r682; | |
| add.s32 %r683, %r775, %r584; | |
| add.s32 %r532, %r44, %r683; | |
| add.s32 %r684, %r775, %r583; | |
| add.s32 %r534, %r44, %r684; | |
| add.s32 %r685, %r775, %r582; | |
| add.s32 %r536, %r44, %r685; | |
| selp.b32 %r686, 16, 0, %p10; | |
| selp.b32 %r533, %r686, 0, %p8; | |
| cp.async.cg.shared.global [ %r530 + 0 ], [ %rd64 + 0 ], 0x10, %r533; | |
| cp.async.cg.shared.global [ %r532 + 0 ], [ %rd65 + 0 ], 0x10, %r533; | |
| cp.async.cg.shared.global [ %r534 + 0 ], [ %rd66 + 0 ], 0x10, %r533; | |
| cp.async.cg.shared.global [ %r536 + 0 ], [ %rd67 + 0 ], 0x10, %r533; | |
| cp.async.commit_group ; | |
| .loc 1 226 20 | |
| cp.async.wait_group 0x2; | |
| bar.sync 0; | |
| sub.s32 %r687, %r775, %r574; | |
| add.s32 %r688, %r687, 4096; | |
| add.s32 %r798, %r250, %r688; | |
| mov.u32 %r801, 0; | |
| .loc 1 227 20 | |
| add.s32 %r794, %r250, %r577; | |
| .loc 1 223 22 | |
| add.s32 %r99, %r793, 1; | |
| .loc 1 226 20 | |
| add.s32 %r690, %r264, %r688; | |
| add.s32 %r691, %r687, 5120; | |
| add.s32 %r692, %r264, %r691; | |
| ld.shared.v2.u32 {%r791, %r790}, [%r690]; | |
| ld.shared.v2.u32 {%r789, %r788}, [%r692]; | |
| add.s32 %r693, %r266, %r688; | |
| add.s32 %r694, %r266, %r691; | |
| ld.shared.v2.u32 {%r787, %r786}, [%r693]; | |
| ld.shared.v2.u32 {%r785, %r784}, [%r694]; | |
| add.s32 %r695, %r687, 6144; | |
| add.s32 %r696, %r264, %r695; | |
| add.s32 %r697, %r687, 7168; | |
| add.s32 %r698, %r264, %r697; | |
| ld.shared.v2.u32 {%r783, %r782}, [%r696]; | |
| ld.shared.v2.u32 {%r781, %r780}, [%r698]; | |
| add.s32 %r699, %r266, %r695; | |
| add.s32 %r700, %r266, %r697; | |
| ld.shared.v2.u32 {%r779, %r778}, [%r699]; | |
| ld.shared.v2.u32 {%r777, %r776}, [%r700]; | |
| .loc 1 227 20 | |
| add.s32 %r701, %r775, %r576; | |
| add.s32 %r542, %r44, %r701; | |
| ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r538, %r539, %r540, %r541 }, [ %r542 + 0 ]; | |
| add.s32 %r702, %r775, %r575; | |
| add.s32 %r547, %r44, %r702; | |
| ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r543, %r544, %r545, %r546 }, [ %r547 + 0 ]; | |
| .loc 1 229 38 | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r538; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r548, f0, f1, 0x7632; | |
| prmt.b32 %r549, f2, f3, 0x7632; | |
| } | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs356}, %r549; } | |
| cvt.u16.u32 %rs355, %r549; | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs354}, %r548; } | |
| cvt.u16.u32 %rs353, %r548; | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r539; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r551, f0, f1, 0x7632; | |
| prmt.b32 %r552, f2, f3, 0x7632; | |
| } | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs348}, %r552; } | |
| cvt.u16.u32 %rs347, %r552; | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs346}, %r551; } | |
| cvt.u16.u32 %rs345, %r551; | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r540; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r554, f0, f1, 0x7632; | |
| prmt.b32 %r555, f2, f3, 0x7632; | |
| } | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs352}, %r555; } | |
| cvt.u16.u32 %rs351, %r555; | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs350}, %r554; } | |
| cvt.u16.u32 %rs349, %r554; | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r541; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r557, f0, f1, 0x7632; | |
| prmt.b32 %r558, f2, f3, 0x7632; | |
| } | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs344}, %r558; } | |
| cvt.u16.u32 %rs343, %r558; | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs342}, %r557; } | |
| cvt.u16.u32 %rs341, %r557; | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r543; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r560, f0, f1, 0x7632; | |
| prmt.b32 %r561, f2, f3, 0x7632; | |
| } | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs340}, %r561; } | |
| cvt.u16.u32 %rs339, %r561; | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs338}, %r560; } | |
| cvt.u16.u32 %rs337, %r560; | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r544; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r563, f0, f1, 0x7632; | |
| prmt.b32 %r564, f2, f3, 0x7632; | |
| } | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs332}, %r564; } | |
| cvt.u16.u32 %rs331, %r564; | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs330}, %r563; } | |
| cvt.u16.u32 %rs329, %r563; | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r545; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r566, f0, f1, 0x7632; | |
| prmt.b32 %r567, f2, f3, 0x7632; | |
| } | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs336}, %r567; } | |
| cvt.u16.u32 %rs335, %r567; | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs334}, %r566; } | |
| cvt.u16.u32 %rs333, %r566; | |
| { | |
| .reg .s8 s<4>; | |
| .reg .f32 f<4>; | |
| mov.b32 {s0, s1, s2, s3}, %r546; | |
| cvt.rn.f32.s8 f0, s0; | |
| cvt.rn.f32.s8 f1, s1; | |
| cvt.rn.f32.s8 f2, s2; | |
| cvt.rn.f32.s8 f3, s3; | |
| prmt.b32 %r569, f0, f1, 0x7632; | |
| prmt.b32 %r570, f2, f3, 0x7632; | |
| } | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs328}, %r570; } | |
| cvt.u16.u32 %rs327, %r570; | |
| { .reg .b16 tmp; mov.b32 {tmp, %rs326}, %r569; } | |
| cvt.u16.u32 %rs325, %r569; | |
| .loc 1 223 22 | |
| add.s32 %r775, %r775, 4096; | |
| add.s32 %r774, %r774, 4096; | |
| add.s32 %r773, %r773, -64; | |
| add.s64 %rd88, %rd88, 64; | |
| add.s64 %rd87, %rd87, 128; | |
| add.s64 %rd86, %rd86, 128; | |
| add.s64 %rd85, %rd85, 128; | |
| add.s64 %rd84, %rd84, 128; | |
| add.s32 %r703, %r793, -1; | |
| setp.lt.s32 %p11, %r703, %r15; | |
| mov.u32 %r793, %r99; | |
| @%p11 bra $L__BB0_2; | |
| $L__BB0_3: | |
| .loc 1 208 51 | |
| and.b32 %r752, %r1, 63; | |
| .loc 1 208 38 | |
| or.b32 %r753, %r14, %r752; | |
| .loc 1 208 68 | |
| rem.s32 %r754, %r753, %r120; | |
| .loc 1 212 22 | |
| mul.wide.s32 %rd74, %r754, 2; | |
| add.s64 %rd69, %rd29, %rd74; | |
| .loc 1 208 38 | |
| or.b32 %r755, %r14, %r7; | |
| .loc 1 223 22 | |
| cp.async.wait_group 0x0; | |
| bar.sync 0; | |
| mov.pred %p12, -1; | |
| .loc 1 234 16 | |
| mov.u16 %rs135, 0x0; | |
| @%p12 ld.global.b16 { %rs135 }, [ %rd68 + 0 ]; | |
| .loc 1 235 16 | |
| mov.u16 %rs100, 0x0; | |
| @%p12 ld.global.b16 { %rs100 }, [ %rd69 + 0 ]; | |
| .loc 1 236 24 | |
| mov.b32 %r704, %f417; | |
| cvt.rn.bf16.f32 %rs134, %r704; | |
| mov.b32 %r705, %f418; | |
| cvt.rn.bf16.f32 %rs137, %r705; | |
| mov.b32 %r706, %f419; | |
| cvt.rn.bf16.f32 %rs140, %r706; | |
| mov.b32 %r707, %f420; | |
| cvt.rn.bf16.f32 %rs143, %r707; | |
| mov.b32 %r708, %f421; | |
| cvt.rn.bf16.f32 %rs146, %r708; | |
| mov.b32 %r709, %f422; | |
| cvt.rn.bf16.f32 %rs149, %r709; | |
| mov.b32 %r710, %f423; | |
| cvt.rn.bf16.f32 %rs152, %r710; | |
| mov.b32 %r711, %f424; | |
| cvt.rn.bf16.f32 %rs155, %r711; | |
| mov.b32 %r712, %f425; | |
| cvt.rn.bf16.f32 %rs158, %r712; | |
| mov.b32 %r713, %f426; | |
| cvt.rn.bf16.f32 %rs161, %r713; | |
| mov.b32 %r714, %f427; | |
| cvt.rn.bf16.f32 %rs164, %r714; | |
| mov.b32 %r715, %f428; | |
| cvt.rn.bf16.f32 %rs167, %r715; | |
| mov.b32 %r716, %f429; | |
| cvt.rn.bf16.f32 %rs170, %r716; | |
| mov.b32 %r717, %f430; | |
| cvt.rn.bf16.f32 %rs173, %r717; | |
| mov.b32 %r718, %f431; | |
| cvt.rn.bf16.f32 %rs176, %r718; | |
| mov.b32 %r719, %f432; | |
| cvt.rn.bf16.f32 %rs179, %r719; | |
| mov.b32 %r720, %f433; | |
| cvt.rn.bf16.f32 %rs182, %r720; | |
| mov.b32 %r721, %f434; | |
| cvt.rn.bf16.f32 %rs185, %r721; | |
| mov.b32 %r722, %f435; | |
| cvt.rn.bf16.f32 %rs188, %r722; | |
| mov.b32 %r723, %f436; | |
| cvt.rn.bf16.f32 %rs191, %r723; | |
| mov.b32 %r724, %f437; | |
| cvt.rn.bf16.f32 %rs194, %r724; | |
| mov.b32 %r725, %f438; | |
| cvt.rn.bf16.f32 %rs197, %r725; | |
| mov.b32 %r726, %f439; | |
| cvt.rn.bf16.f32 %rs200, %r726; | |
| mov.b32 %r727, %f440; | |
| cvt.rn.bf16.f32 %rs203, %r727; | |
| mov.b32 %r728, %f441; | |
| cvt.rn.bf16.f32 %rs206, %r728; | |
| mov.b32 %r729, %f442; | |
| cvt.rn.bf16.f32 %rs209, %r729; | |
| mov.b32 %r730, %f443; | |
| cvt.rn.bf16.f32 %rs212, %r730; | |
| mov.b32 %r731, %f444; | |
| cvt.rn.bf16.f32 %rs215, %r731; | |
| mov.b32 %r732, %f445; | |
| cvt.rn.bf16.f32 %rs218, %r732; | |
| mov.b32 %r733, %f446; | |
| cvt.rn.bf16.f32 %rs221, %r733; | |
| mov.b32 %r734, %f447; | |
| cvt.rn.bf16.f32 %rs224, %r734; | |
| mov.b32 %r735, %f448; | |
| cvt.rn.bf16.f32 %rs227, %r735; | |
| .loc 1 236 39 | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs230, %rs134, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs233, %rs137, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs236, %rs140, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs239, %rs143, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs242, %rs146, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs245, %rs149, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs248, %rs152, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs251, %rs155, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs254, %rs158, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs257, %rs161, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs260, %rs164, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs263, %rs167, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs266, %rs170, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs269, %rs173, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs272, %rs176, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs275, %rs179, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs278, %rs182, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs281, %rs185, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs284, %rs188, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs287, %rs191, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs290, %rs194, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs293, %rs197, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs296, %rs200, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs299, %rs203, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs302, %rs206, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs305, %rs209, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs308, %rs212, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs311, %rs215, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs314, %rs218, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs317, %rs221, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs320, %rs224, %rs135, c; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x8000U; | |
| fma.rn.bf16 %rs323, %rs227, %rs135, c; } | |
| .loc 1 236 43 | |
| shl.b32 %r756, %r752, 1; | |
| add.s32 %r758, %r250, %r756; | |
| st.shared.u16 [%r758], %rs100; | |
| bar.sync 0; | |
| shl.b32 %r759, %r6, 1; | |
| or.b32 %r761, %r18, %r759; | |
| add.s32 %r762, %r250, %r761; | |
| ld.shared.u16 %rs237, [%r762]; | |
| ld.shared.u16 %rs240, [%r762+2]; | |
| ld.shared.u16 %rs249, [%r762+32]; | |
| ld.shared.u16 %rs252, [%r762+34]; | |
| ld.shared.u16 %rs261, [%r762+64]; | |
| ld.shared.u16 %rs264, [%r762+66]; | |
| ld.shared.u16 %rs273, [%r762+96]; | |
| ld.shared.u16 %rs276, [%r762+98]; | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs229, %rs230, c, %rs237; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs232, %rs233, c, %rs240; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs235, %rs236, c, %rs237; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs238, %rs239, c, %rs240; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs241, %rs242, c, %rs249; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs244, %rs245, c, %rs252; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs247, %rs248, c, %rs249; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs250, %rs251, c, %rs252; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs253, %rs254, c, %rs261; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs256, %rs257, c, %rs264; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs259, %rs260, c, %rs261; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs262, %rs263, c, %rs264; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs265, %rs266, c, %rs273; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs268, %rs269, c, %rs276; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs271, %rs272, c, %rs273; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs274, %rs275, c, %rs276; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs277, %rs278, c, %rs237; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs280, %rs281, c, %rs240; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs283, %rs284, c, %rs237; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs286, %rs287, c, %rs240; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs289, %rs290, c, %rs249; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs292, %rs293, c, %rs252; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs295, %rs296, c, %rs249; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs298, %rs299, c, %rs252; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs301, %rs302, c, %rs261; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs304, %rs305, c, %rs264; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs307, %rs308, c, %rs261; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs310, %rs311, c, %rs264; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs313, %rs314, c, %rs273; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs316, %rs317, c, %rs276; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs319, %rs320, c, %rs273; } | |
| { .reg .b16 c; | |
| mov.b16 c, 0x3f80U; | |
| fma.rn.bf16 %rs322, %rs323, c, %rs276; } | |
| .loc 1 243 33 | |
| mul.lo.s32 %r763, %r10, %r122; | |
| mul.lo.s32 %r764, %r11, %r122; | |
| shl.b32 %r765, %r122, 4; | |
| add.s32 %r766, %r763, %r765; | |
| shl.b32 %r767, %r122, 3; | |
| add.s32 %r768, %r766, %r767; | |
| .loc 1 243 21 | |
| mul.wide.s32 %rd75, %r763, 2; | |
| add.s64 %rd76, %rd31, %rd75; | |
| mul.wide.s32 %rd77, %r764, 2; | |
| add.s64 %rd78, %rd31, %rd77; | |
| mul.wide.s32 %rd79, %r766, 2; | |
| add.s64 %rd80, %rd31, %rd79; | |
| mul.wide.s32 %rd81, %r768, 2; | |
| add.s64 %rd82, %rd31, %rd81; | |
| .loc 1 243 52 | |
| mul.wide.s32 %rd83, %r755, 2; | |
| add.s64 %rd70, %rd76, %rd83; | |
| add.s64 %rd71, %rd78, %rd83; | |
| add.s64 %rd72, %rd80, %rd83; | |
| add.s64 %rd73, %rd82, %rd83; | |
| .loc 1 244 33 | |
| setp.lt.s32 %p18, %r10, %r119; | |
| setp.lt.s32 %p19, %r11, %r119; | |
| setp.lt.s32 %p20, %r12, %r119; | |
| setp.lt.s32 %p21, %r13, %r119; | |
| .loc 1 244 58 | |
| setp.lt.s32 %p22, %r755, %r120; | |
| .loc 1 244 39 | |
| and.pred %p14, %p18, %p22; | |
| and.pred %p15, %p19, %p22; | |
| and.pred %p16, %p20, %p22; | |
| and.pred %p17, %p21, %p22; | |
| .loc 1 245 21 | |
| bar.sync 0; | |
| mad.lo.s32 %r769, %r5, 144, %r762; | |
| st.shared.v2.u16 [%r769], {%rs229, %rs232}; | |
| st.shared.v2.u16 [%r769+1152], {%rs235, %rs238}; | |
| st.shared.v2.u16 [%r769+32], {%rs241, %rs244}; | |
| st.shared.v2.u16 [%r769+1184], {%rs247, %rs250}; | |
| st.shared.v2.u16 [%r769+64], {%rs253, %rs256}; | |
| st.shared.v2.u16 [%r769+1216], {%rs259, %rs262}; | |
| st.shared.v2.u16 [%r769+96], {%rs265, %rs268}; | |
| st.shared.v2.u16 [%r769+1248], {%rs271, %rs274}; | |
| bar.sync 0; | |
| mad.lo.s32 %r770, %r4, 72, %r7; | |
| shl.b32 %r771, %r770, 1; | |
| add.s32 %r772, %r250, %r771; | |
| ld.shared.v4.u32 {%r736, %r737, %r738, %r739}, [%r772]; | |
| ld.shared.v4.u32 {%r740, %r741, %r742, %r743}, [%r772+1152]; | |
| bar.sync 0; | |
| st.shared.v2.u16 [%r769], {%rs277, %rs280}; | |
| st.shared.v2.u16 [%r769+1152], {%rs283, %rs286}; | |
| st.shared.v2.u16 [%r769+32], {%rs289, %rs292}; | |
| st.shared.v2.u16 [%r769+1184], {%rs295, %rs298}; | |
| st.shared.v2.u16 [%r769+64], {%rs301, %rs304}; | |
| st.shared.v2.u16 [%r769+1216], {%rs307, %rs310}; | |
| st.shared.v2.u16 [%r769+96], {%rs313, %rs316}; | |
| st.shared.v2.u16 [%r769+1248], {%rs319, %rs322}; | |
| bar.sync 0; | |
| ld.shared.v4.u32 {%r744, %r745, %r746, %r747}, [%r772]; | |
| ld.shared.v4.u32 {%r748, %r749, %r750, %r751}, [%r772+1152]; | |
| @%p14 st.global.v4.b32 [ %rd70 + 0 ], { %r736, %r737, %r738, %r739 }; | |
| @%p15 st.global.v4.b32 [ %rd71 + 0 ], { %r740, %r741, %r742, %r743 }; | |
| @%p16 st.global.v4.b32 [ %rd72 + 0 ], { %r744, %r745, %r746, %r747 }; | |
| @%p17 st.global.v4.b32 [ %rd73 + 0 ], { %r748, %r749, %r750, %r751 }; | |
| .loc 1 245 4 | |
| ret; | |
| $L__tmp8: | |
| $L__func_end0: | |
| } | |
| .file 1 "/home/cdhernandez/local/test.py" | |
| .file 2 "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/triton/language/standard.py" | |
| .file 3 "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/triton/language/core.py" | |
| // DWARF abbreviation table. Each entry is: abbreviation code, DIE tag, | |
| // has-children flag, then (attribute, form) pairs ended by a 0,0 pair. | |
| // Numeric values below are decoded with the standard DWARF encoding tables. | |
| .section .debug_abbrev | |
| { | |
| // Abbrev 1: tag 17 = DW_TAG_compile_unit, children = yes | |
| .b8 1 | |
| .b8 17 | |
| .b8 1 | |
| // 37 = DW_AT_producer, form 8 = DW_FORM_string | |
| .b8 37 | |
| .b8 8 | |
| // 19 = DW_AT_language, form 5 = DW_FORM_data2 | |
| .b8 19 | |
| .b8 5 | |
| // 3 = DW_AT_name, form 8 = DW_FORM_string | |
| .b8 3 | |
| .b8 8 | |
| // 16 = DW_AT_stmt_list, form 6 = DW_FORM_data4 | |
| .b8 16 | |
| .b8 6 | |
| // 27 = DW_AT_comp_dir, form 8 = DW_FORM_string | |
| .b8 27 | |
| .b8 8 | |
| // ULEB128 bytes 180,66 = 0x2134 = DW_AT_GNU_pubnames, form 12 = DW_FORM_flag | |
| .b8 180 | |
| .b8 66 | |
| .b8 12 | |
| // 17 = DW_AT_low_pc, form 1 = DW_FORM_addr | |
| .b8 17 | |
| .b8 1 | |
| // 18 = DW_AT_high_pc, form 1 = DW_FORM_addr | |
| .b8 18 | |
| .b8 1 | |
| // end of abbrev 1 attribute list | |
| .b8 0 | |
| .b8 0 | |
| // Abbrev 2: tag 46 = DW_TAG_subprogram, children = no | |
| .b8 2 | |
| .b8 46 | |
| .b8 0 | |
| // ULEB128 bytes 135,64 = 0x2007 = DW_AT_MIPS_linkage_name, form 8 = DW_FORM_string | |
| .b8 135 | |
| .b8 64 | |
| .b8 8 | |
| // 3 = DW_AT_name, form 8 = DW_FORM_string | |
| .b8 3 | |
| .b8 8 | |
| // 58 = DW_AT_decl_file, form 11 = DW_FORM_data1 | |
| .b8 58 | |
| .b8 11 | |
| // 59 = DW_AT_decl_line, form 11 = DW_FORM_data1 | |
| .b8 59 | |
| .b8 11 | |
| // 63 = DW_AT_external, form 12 = DW_FORM_flag | |
| .b8 63 | |
| .b8 12 | |
| // 32 = DW_AT_inline, form 11 = DW_FORM_data1 | |
| .b8 32 | |
| .b8 11 | |
| // end of abbrev 2 attribute list | |
| .b8 0 | |
| .b8 0 | |
| // Abbrev 3: tag 46 = DW_TAG_subprogram, children = yes | |
| .b8 3 | |
| .b8 46 | |
| .b8 1 | |
| // 17 = DW_AT_low_pc, form 1 = DW_FORM_addr | |
| .b8 17 | |
| .b8 1 | |
| // 18 = DW_AT_high_pc, form 1 = DW_FORM_addr | |
| .b8 18 | |
| .b8 1 | |
| // 64 = DW_AT_frame_base, form 10 = DW_FORM_block1 | |
| .b8 64 | |
| .b8 10 | |
| // 49 = DW_AT_abstract_origin, form 19 = DW_FORM_ref4 | |
| .b8 49 | |
| .b8 19 | |
| // end of abbrev 3 attribute list | |
| .b8 0 | |
| .b8 0 | |
| // Abbrev 4: tag 29 = DW_TAG_inlined_subroutine, children = no | |
| .b8 4 | |
| .b8 29 | |
| .b8 0 | |
| // 49 = DW_AT_abstract_origin, form 19 = DW_FORM_ref4 | |
| .b8 49 | |
| .b8 19 | |
| // 17 = DW_AT_low_pc, form 1 = DW_FORM_addr | |
| .b8 17 | |
| .b8 1 | |
| // 18 = DW_AT_high_pc, form 1 = DW_FORM_addr | |
| .b8 18 | |
| .b8 1 | |
| // 88 = DW_AT_call_file, form 11 = DW_FORM_data1 | |
| .b8 88 | |
| .b8 11 | |
| // 89 = DW_AT_call_line, form 11 = DW_FORM_data1 | |
| .b8 89 | |
| .b8 11 | |
| // 87 = DW_AT_call_column, form 11 = DW_FORM_data1 | |
| .b8 87 | |
| .b8 11 | |
| // end of abbrev 4 attribute list | |
| .b8 0 | |
| .b8 0 | |
| // abbreviation code 0 terminates the table | |
| .b8 0 | |
| } | |
| // DWARF .debug_info: one compile unit describing the kernel. Each DIE starts | |
| // with an abbreviation code defined in the .debug_abbrev section of this file; | |
| // attribute values follow in the order that abbreviation lists them. | |
| .section .debug_info | |
| { | |
| // Unit header: unit length (330 bytes following this field) | |
| .b32 330 | |
| // DWARF version, 16-bit little-endian = 2 | |
| .b8 2 | |
| .b8 0 | |
| // offset of this unit's abbreviations (symbol reference to .debug_abbrev) | |
| .b32 .debug_abbrev | |
| // address size in bytes | |
| .b8 8 | |
| // DIE, abbrev 1 (DW_TAG_compile_unit) | |
| .b8 1 | |
| // DW_AT_producer: ASCII "triton" + NUL | |
| .b8 116 | |
| .b8 114 | |
| .b8 105 | |
| .b8 116 | |
| .b8 111 | |
| .b8 110 | |
| .b8 0 | |
| // DW_AT_language: 16-bit value 2 (DW_LANG_C in the DWARF language table) | |
| .b8 2 | |
| .b8 0 | |
| // DW_AT_name: ASCII "test.py" + NUL | |
| .b8 116 | |
| .b8 101 | |
| .b8 115 | |
| .b8 116 | |
| .b8 46 | |
| .b8 112 | |
| .b8 121 | |
| .b8 0 | |
| // DW_AT_stmt_list: reference to this unit's line table in .debug_line | |
| .b32 .debug_line | |
| // DW_AT_comp_dir: ASCII "/home/cdhernandez/local" + NUL | |
| .b8 47 | |
| .b8 104 | |
| .b8 111 | |
| .b8 109 | |
| .b8 101 | |
| .b8 47 | |
| .b8 99 | |
| .b8 100 | |
| .b8 104 | |
| .b8 101 | |
| .b8 114 | |
| .b8 110 | |
| .b8 97 | |
| .b8 110 | |
| .b8 100 | |
| .b8 101 | |
| .b8 122 | |
| .b8 47 | |
| .b8 108 | |
| .b8 111 | |
| .b8 99 | |
| .b8 97 | |
| .b8 108 | |
| .b8 0 | |
| // DW_AT_GNU_pubnames flag = 1 | |
| .b8 1 | |
| // DW_AT_low_pc / DW_AT_high_pc: function begin/end labels | |
| .b64 $L__func_begin0 | |
| .b64 $L__func_end0 | |
| // DIE at unit offset 74, abbrev 2 (DW_TAG_subprogram, no children) | |
| .b8 2 | |
| // DW_AT_MIPS_linkage_name: ASCII | |
| // "int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c" + NUL | |
| .b8 105 | |
| .b8 110 | |
| .b8 116 | |
| .b8 56 | |
| .b8 95 | |
| .b8 119 | |
| .b8 101 | |
| .b8 105 | |
| .b8 103 | |
| .b8 104 | |
| .b8 116 | |
| .b8 95 | |
| .b8 111 | |
| .b8 110 | |
| .b8 108 | |
| .b8 121 | |
| .b8 95 | |
| .b8 108 | |
| .b8 105 | |
| .b8 110 | |
| .b8 101 | |
| .b8 97 | |
| .b8 114 | |
| .b8 95 | |
| .b8 107 | |
| .b8 101 | |
| .b8 114 | |
| .b8 110 | |
| .b8 101 | |
| .b8 108 | |
| .b8 95 | |
| .b8 48 | |
| .b8 100 | |
| .b8 49 | |
| .b8 100 | |
| .b8 50 | |
| .b8 100 | |
| .b8 51 | |
| .b8 100 | |
| .b8 52 | |
| .b8 100 | |
| .b8 53 | |
| .b8 100 | |
| .b8 54 | |
| .b8 100 | |
| .b8 55 | |
| .b8 100 | |
| .b8 56 | |
| .b8 100 | |
| .b8 57 | |
| .b8 99 | |
| .b8 49 | |
| .b8 48 | |
| .b8 99 | |
| .b8 49 | |
| .b8 49 | |
| .b8 100 | |
| .b8 49 | |
| .b8 50 | |
| .b8 99 | |
| .b8 49 | |
| .b8 51 | |
| .b8 100 | |
| .b8 49 | |
| .b8 52 | |
| .b8 99 | |
| .b8 0 | |
| // DW_AT_name: the same 66-character kernel name + NUL | |
| .b8 105 | |
| .b8 110 | |
| .b8 116 | |
| .b8 56 | |
| .b8 95 | |
| .b8 119 | |
| .b8 101 | |
| .b8 105 | |
| .b8 103 | |
| .b8 104 | |
| .b8 116 | |
| .b8 95 | |
| .b8 111 | |
| .b8 110 | |
| .b8 108 | |
| .b8 121 | |
| .b8 95 | |
| .b8 108 | |
| .b8 105 | |
| .b8 110 | |
| .b8 101 | |
| .b8 97 | |
| .b8 114 | |
| .b8 95 | |
| .b8 107 | |
| .b8 101 | |
| .b8 114 | |
| .b8 110 | |
| .b8 101 | |
| .b8 108 | |
| .b8 95 | |
| .b8 48 | |
| .b8 100 | |
| .b8 49 | |
| .b8 100 | |
| .b8 50 | |
| .b8 100 | |
| .b8 51 | |
| .b8 100 | |
| .b8 52 | |
| .b8 100 | |
| .b8 53 | |
| .b8 100 | |
| .b8 54 | |
| .b8 100 | |
| .b8 55 | |
| .b8 100 | |
| .b8 56 | |
| .b8 100 | |
| .b8 57 | |
| .b8 99 | |
| .b8 49 | |
| .b8 48 | |
| .b8 99 | |
| .b8 49 | |
| .b8 49 | |
| .b8 100 | |
| .b8 49 | |
| .b8 50 | |
| .b8 99 | |
| .b8 49 | |
| .b8 51 | |
| .b8 100 | |
| .b8 49 | |
| .b8 52 | |
| .b8 99 | |
| .b8 0 | |
| // DW_AT_decl_file = 1, DW_AT_decl_line = 167, DW_AT_external = 1, | |
| // DW_AT_inline = 1 (DW_INL_inlined) | |
| .b8 1 | |
| .b8 167 | |
| .b8 1 | |
| .b8 1 | |
| // DIE, abbrev 3 (DW_TAG_subprogram with children): the concrete code range | |
| .b8 3 | |
| // DW_AT_low_pc / DW_AT_high_pc | |
| .b64 $L__func_begin0 | |
| .b64 $L__func_end0 | |
| // DW_AT_frame_base: 1-byte block containing opcode 156 (0x9c, DW_OP_call_frame_cfa) | |
| .b8 1 | |
| .b8 156 | |
| // DW_AT_abstract_origin: unit offset 74, i.e. the abbrev-2 subprogram DIE above | |
| .b32 74 | |
| // Child DIE, abbrev 4 (DW_TAG_inlined_subroutine): abstract_origin = 74, | |
| // range $L__tmp1..$L__tmp2, call_file = 2, call_line = 191, call_column = 27 | |
| .b8 4 | |
| .b32 74 | |
| .b64 $L__tmp1 | |
| .b64 $L__tmp2 | |
| .b8 2 | |
| .b8 191 | |
| .b8 27 | |
| // Child DIE, abbrev 4: abstract_origin = 74, range $L__tmp2..$L__tmp3, | |
| // call_file = 2, call_line = 192, call_column = 27 | |
| .b8 4 | |
| .b32 74 | |
| .b64 $L__tmp2 | |
| .b64 $L__tmp3 | |
| .b8 2 | |
| .b8 192 | |
| .b8 27 | |
| // Child DIE, abbrev 4: abstract_origin = 74, range $L__tmp4..$L__tmp5, | |
| // call_file = 3, call_line = 196, call_column = 48 | |
| .b8 4 | |
| .b32 74 | |
| .b64 $L__tmp4 | |
| .b64 $L__tmp5 | |
| .b8 3 | |
| .b8 196 | |
| .b8 48 | |
| // Child DIE, abbrev 4: abstract_origin = 74, range $L__tmp6..$L__tmp7, | |
| // call_file = 2, call_line = 223, call_column = 33 | |
| .b8 4 | |
| .b32 74 | |
| .b64 $L__tmp6 | |
| .b64 $L__tmp7 | |
| .b8 2 | |
| .b8 223 | |
| .b8 33 | |
| // null entry: ends the children of the abbrev-3 subprogram DIE | |
| .b8 0 | |
| // null entry: ends the children of the compile-unit DIE | |
| .b8 0 | |
| } | |
| // DWARF .debug_pubnames: name lookup table for the compile unit. | |
| // Layout: header, then (DIE offset, NUL-terminated name) pairs, ended by offset 0. | |
| .section .debug_pubnames | |
| { | |
| // unit length: byte distance between the start/end labels below | |
| .b32 $L__pubNames_end0-$L__pubNames_start0 | |
| $L__pubNames_start0: | |
| // table version, 16-bit little-endian = 2 | |
| .b8 2 | |
| .b8 0 | |
| // reference to the described compile unit in .debug_info | |
| .b32 .debug_info | |
| // size in bytes of that compile unit's .debug_info contribution | |
| .b32 334 | |
| // entry: DIE at unit offset 74, followed by its name, ASCII | |
| // "int8_weight_only_linear_kernel_0d1d2d3d4d5d6d7d8d9c10c11d12c13d14c" + NUL | |
| .b32 74 | |
| .b8 105 | |
| .b8 110 | |
| .b8 116 | |
| .b8 56 | |
| .b8 95 | |
| .b8 119 | |
| .b8 101 | |
| .b8 105 | |
| .b8 103 | |
| .b8 104 | |
| .b8 116 | |
| .b8 95 | |
| .b8 111 | |
| .b8 110 | |
| .b8 108 | |
| .b8 121 | |
| .b8 95 | |
| .b8 108 | |
| .b8 105 | |
| .b8 110 | |
| .b8 101 | |
| .b8 97 | |
| .b8 114 | |
| .b8 95 | |
| .b8 107 | |
| .b8 101 | |
| .b8 114 | |
| .b8 110 | |
| .b8 101 | |
| .b8 108 | |
| .b8 95 | |
| .b8 48 | |
| .b8 100 | |
| .b8 49 | |
| .b8 100 | |
| .b8 50 | |
| .b8 100 | |
| .b8 51 | |
| .b8 100 | |
| .b8 52 | |
| .b8 100 | |
| .b8 53 | |
| .b8 100 | |
| .b8 54 | |
| .b8 100 | |
| .b8 55 | |
| .b8 100 | |
| .b8 56 | |
| .b8 100 | |
| .b8 57 | |
| .b8 99 | |
| .b8 49 | |
| .b8 48 | |
| .b8 99 | |
| .b8 49 | |
| .b8 49 | |
| .b8 100 | |
| .b8 49 | |
| .b8 50 | |
| .b8 99 | |
| .b8 49 | |
| .b8 51 | |
| .b8 100 | |
| .b8 49 | |
| .b8 52 | |
| .b8 99 | |
| .b8 0 | |
| // offset 0 terminates the entry list | |
| .b32 0 | |
| $L__pubNames_end0: | |
| } | |
| // DWARF .debug_pubtypes: type lookup table for the compile unit. | |
| // It holds only the header and the terminating 0 offset, i.e. no type entries. | |
| .section .debug_pubtypes | |
| { | |
| // unit length: byte distance between the start/end labels below | |
| .b32 $L__pubTypes_end0-$L__pubTypes_start0 | |
| $L__pubTypes_start0: | |
| // table version, 16-bit little-endian = 2 | |
| .b8 2 | |
| .b8 0 | |
| // reference to the described compile unit in .debug_info | |
| .b32 .debug_info | |
| // size in bytes of that compile unit's .debug_info contribution | |
| .b32 334 | |
| // offset 0 terminates the (empty) entry list | |
| .b32 0 | |
| $L__pubTypes_end0: | |
| } | |
| // DWARF .debug_loc: declared with an empty body, so it contains no location lists. | |
| .section .debug_loc { } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment