Skip to content

Instantly share code, notes, and snippets.

@antiagainst
Last active February 4, 2024 19:33
Show Gist options
  • Save antiagainst/5157cafb6f54c763ea216d84ba56b9d0 to your computer and use it in GitHub Desktop.
//
// Generated by LLVM NVPTX Back-End
//
// Module-level PTX directives for the single matmul entry point defined below.
// This file is compiler output; the comments here were added by hand for readers.
//
// PTX ISA version this module is written against.
.version 7.6
// Target architecture. The kernel below issues cp.async and
// mma.sync ... tf32 instructions, so it depends on this target level
// (NOTE(review): confirm minimum sm level against the PTX ISA tables).
.target sm_80
// Addresses are 64 bits wide; the kernel does its address arithmetic in .b64 registers.
.address_size 64
// .globl matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32
// Unsized, 16-byte-aligned byte array in the .shared state space, declared
// extern: its size is not fixed anywhere in this file.
// NOTE(review): presumably sized by the launcher as dynamic shared memory -
// confirm the launch configuration reserves enough for every offset the
// kernel uses (it addresses both base+0.. and base+32768.. regions).
.extern .shared .align 16 .b8 __dynamic_shared_memory__[];
.visible .entry matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32(
.param .u64 matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_0,
.param .u64 matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_1,
.param .u64 matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_2
)
.maxntid 128, 2, 1
{
.reg .pred %p<3>;
.reg .b32 %r<143>;
.reg .f32 %f<837>;
.reg .b64 %rd<542>;
ld.param.u64 %rd88, [matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_2];
ld.param.u64 %rd94, [matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_0];
ld.param.u64 %rd95, [matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_1];
mov.u32 %r49, %tid.x;
cvt.u64.u32 %rd96, %r49;
mov.u32 %r50, %tid.y;
mov.u32 %r51, %ctaid.y;
mov.u32 %r52, %ctaid.x;
shl.b32 %r53, %r51, 7;
cvt.u64.u32 %rd1, %r53;
mul.wide.u32 %rd97, %r50, 32;
shr.u64 %rd98, %rd96, 2;
or.b64 %rd99, %rd98, %rd97;
or.b64 %rd100, %rd99, %rd1;
mul.wide.u32 %rd101, %r49, 4;
shl.b64 %rd102, %rd98, 4;
sub.s64 %rd103, %rd101, %rd102;
shr.u64 %rd104, %rd96, 1;
and.b64 %rd105, %rd104, 12;
xor.b64 %rd106, %rd103, %rd105;
mul.wide.u32 %rd107, %r52, 256;
shr.u64 %rd108, %rd96, 6;
shl.b64 %rd109, %rd108, 8;
sub.s64 %rd2, %rd101, %rd109;
add.s64 %rd3, %rd2, %rd107;
mul.wide.u32 %rd110, %r50, 2;
or.b64 %rd4, %rd110, %rd108;
or.b64 %rd111, %rd4, 4;
or.b64 %rd112, %rd4, 8;
or.b64 %rd113, %rd4, 12;
mov.u32 %r54, %laneid;
cvt.s64.s32 %rd114, %r54;
mul.wide.u32 %rd5, %r50, 64;
add.s64 %rd115, %rd5, %rd114;
shr.s32 %r55, %r54, 31;
xor.b32 %r56, %r55, %r54;
shr.s32 %r57, %r56, 31;
shr.u32 %r58, %r57, 28;
add.s32 %r59, %r56, %r58;
shr.s32 %r60, %r59, 4;
xor.b32 %r61, %r60, %r55;
mul.wide.s32 %rd116, %r61, 4;
mul.wide.s32 %rd6, %r54, 2;
and.b64 %rd117, %rd6, 12;
xor.b64 %rd7, %rd116, %rd117;
add.s64 %rd118, %rd116, 8;
xor.b64 %rd8, %rd118, %rd117;
shr.u32 %r62, %r57, 30;
add.s32 %r63, %r56, %r62;
shr.s32 %r64, %r63, 2;
xor.b32 %r65, %r64, %r55;
cvt.s64.s32 %rd9, %r65;
mul.wide.u32 %rd119, %r49, 2;
and.b64 %rd10, %rd119, 192;
add.s64 %rd120, %rd10, %rd9;
and.b64 %rd121, %rd114, 3;
shl.b64 %rd122, %rd121, 2;
xor.b64 %rd11, %rd120, %rd122;
or.b64 %rd123, %rd121, 4;
shl.b64 %rd124, %rd123, 2;
xor.b64 %rd12, %rd120, %rd124;
or.b64 %rd125, %rd121, 8;
shl.b64 %rd126, %rd125, 2;
xor.b64 %rd13, %rd120, %rd126;
or.b64 %rd127, %rd121, 12;
shl.b64 %rd128, %rd127, 2;
xor.b64 %rd14, %rd120, %rd128;
add.s64 %rd129, %rd120, 8;
xor.b64 %rd15, %rd129, %rd122;
xor.b64 %rd16, %rd129, %rd124;
xor.b64 %rd17, %rd129, %rd126;
xor.b64 %rd18, %rd129, %rd128;
add.s64 %rd130, %rd120, 16;
xor.b64 %rd19, %rd130, %rd122;
xor.b64 %rd20, %rd130, %rd124;
xor.b64 %rd21, %rd130, %rd126;
xor.b64 %rd22, %rd130, %rd128;
add.s64 %rd131, %rd120, 24;
xor.b64 %rd23, %rd131, %rd122;
xor.b64 %rd24, %rd131, %rd124;
xor.b64 %rd25, %rd131, %rd126;
xor.b64 %rd26, %rd131, %rd128;
add.s64 %rd132, %rd120, 32;
xor.b64 %rd27, %rd132, %rd122;
xor.b64 %rd28, %rd132, %rd124;
xor.b64 %rd29, %rd132, %rd126;
xor.b64 %rd30, %rd132, %rd128;
add.s64 %rd133, %rd120, 40;
xor.b64 %rd31, %rd133, %rd122;
xor.b64 %rd32, %rd133, %rd124;
xor.b64 %rd33, %rd133, %rd126;
xor.b64 %rd34, %rd133, %rd128;
add.s64 %rd134, %rd120, 48;
xor.b64 %rd35, %rd134, %rd122;
xor.b64 %rd36, %rd134, %rd124;
xor.b64 %rd37, %rd134, %rd126;
xor.b64 %rd38, %rd134, %rd128;
add.s64 %rd135, %rd120, 56;
xor.b64 %rd39, %rd135, %rd122;
xor.b64 %rd40, %rd135, %rd124;
xor.b64 %rd41, %rd135, %rd126;
xor.b64 %rd42, %rd135, %rd128;
shl.b64 %rd43, %rd99, 4;
shl.b64 %rd136, %rd99, 6;
mov.u64 %rd137, __dynamic_shared_memory__;
shl.b64 %rd138, %rd106, 2;
add.s64 %rd60, %rd137, %rd138;
add.s64 %rd44, %rd60, %rd136;
shl.b64 %rd139, %rd100, 13;
add.s64 %rd140, %rd94, %rd139;
shl.b64 %rd141, %rd103, 2;
add.s64 %rd142, %rd140, %rd141;
cp.async.cg.shared.global [%rd44], [%rd142], 16;
add.s64 %rd143, %rd44, 4096;
or.b64 %rd144, %rd139, 524288;
add.s64 %rd145, %rd94, %rd144;
add.s64 %rd146, %rd145, %rd141;
cp.async.cg.shared.global [%rd143], [%rd146], 16;
shl.b64 %rd45, %rd4, 8;
add.s64 %rd147, %rd137, 32768;
shl.b64 %rd148, %rd4, 4;
shl.b64 %rd149, %rd2, 2;
xor.b64 %rd150, %rd148, %rd149;
shl.b64 %rd151, %rd4, 10;
add.s64 %rd61, %rd147, %rd150;
add.s64 %rd152, %rd61, %rd151;
shl.b64 %rd153, %rd4, 12;
shl.b64 %rd154, %rd3, 2;
add.s64 %rd62, %rd95, %rd154;
add.s64 %rd155, %rd62, %rd153;
cp.async.cg.shared.global [%rd152], [%rd155], 16;
shl.b64 %rd46, %rd111, 8;
shl.b64 %rd156, %rd111, 4;
xor.b64 %rd157, %rd156, %rd149;
shl.b64 %rd158, %rd111, 10;
add.s64 %rd63, %rd147, %rd157;
add.s64 %rd159, %rd63, %rd158;
shl.b64 %rd160, %rd111, 12;
add.s64 %rd161, %rd62, %rd160;
cp.async.cg.shared.global [%rd159], [%rd161], 16;
shl.b64 %rd47, %rd112, 8;
shl.b64 %rd162, %rd112, 4;
xor.b64 %rd163, %rd162, %rd149;
shl.b64 %rd164, %rd112, 10;
add.s64 %rd64, %rd147, %rd163;
add.s64 %rd165, %rd64, %rd164;
shl.b64 %rd166, %rd112, 12;
add.s64 %rd167, %rd62, %rd166;
cp.async.cg.shared.global [%rd165], [%rd167], 16;
shl.b64 %rd48, %rd113, 8;
shl.b64 %rd168, %rd113, 4;
xor.b64 %rd169, %rd168, %rd149;
shl.b64 %rd170, %rd113, 10;
add.s64 %rd65, %rd147, %rd169;
add.s64 %rd171, %rd65, %rd170;
shl.b64 %rd172, %rd113, 12;
add.s64 %rd173, %rd62, %rd172;
cp.async.cg.shared.global [%rd171], [%rd173], 16;
cp.async.commit_group;
add.s64 %rd174, %rd44, 8192;
add.s64 %rd175, %rd142, 64;
cp.async.cg.shared.global [%rd174], [%rd175], 16;
add.s64 %rd176, %rd44, 12288;
add.s64 %rd177, %rd146, 64;
cp.async.cg.shared.global [%rd176], [%rd177], 16;
or.b64 %rd49, %rd4, 16;
add.s64 %rd178, %rd152, 16384;
shl.b64 %rd179, %rd49, 12;
add.s64 %rd180, %rd62, %rd179;
cp.async.cg.shared.global [%rd178], [%rd180], 16;
or.b64 %rd50, %rd4, 20;
add.s64 %rd181, %rd159, 16384;
shl.b64 %rd182, %rd50, 12;
add.s64 %rd183, %rd62, %rd182;
cp.async.cg.shared.global [%rd181], [%rd183], 16;
or.b64 %rd51, %rd4, 24;
add.s64 %rd184, %rd165, 16384;
shl.b64 %rd185, %rd51, 12;
add.s64 %rd186, %rd62, %rd185;
cp.async.cg.shared.global [%rd184], [%rd186], 16;
or.b64 %rd52, %rd4, 28;
add.s64 %rd187, %rd171, 16384;
shl.b64 %rd188, %rd52, 12;
add.s64 %rd189, %rd62, %rd188;
cp.async.cg.shared.global [%rd187], [%rd189], 16;
cp.async.commit_group;
add.s64 %rd190, %rd44, 16384;
add.s64 %rd191, %rd142, 128;
cp.async.cg.shared.global [%rd190], [%rd191], 16;
add.s64 %rd192, %rd44, 20480;
add.s64 %rd193, %rd146, 128;
cp.async.cg.shared.global [%rd192], [%rd193], 16;
or.b64 %rd53, %rd4, 32;
add.s64 %rd194, %rd152, 32768;
shl.b64 %rd195, %rd53, 12;
add.s64 %rd196, %rd62, %rd195;
cp.async.cg.shared.global [%rd194], [%rd196], 16;
or.b64 %rd54, %rd4, 36;
add.s64 %rd197, %rd159, 32768;
shl.b64 %rd198, %rd54, 12;
add.s64 %rd199, %rd62, %rd198;
cp.async.cg.shared.global [%rd197], [%rd199], 16;
or.b64 %rd55, %rd4, 40;
add.s64 %rd200, %rd165, 32768;
shl.b64 %rd201, %rd55, 12;
add.s64 %rd202, %rd62, %rd201;
cp.async.cg.shared.global [%rd200], [%rd202], 16;
or.b64 %rd56, %rd4, 44;
add.s64 %rd203, %rd171, 32768;
shl.b64 %rd204, %rd56, 12;
add.s64 %rd205, %rd62, %rd204;
cp.async.cg.shared.global [%rd203], [%rd205], 16;
cp.async.commit_group;
cp.async.wait_group 2;
bar.sync 0;
mul.wide.s32 %rd206, %r61, 256;
shl.b64 %rd207, %rd115, 4;
sub.s64 %rd57, %rd207, %rd206;
shl.b64 %rd208, %rd57, 2;
add.s64 %rd209, %rd137, %rd208;
shl.b64 %rd210, %rd7, 2;
add.s64 %rd211, %rd209, %rd210;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r139, %r140, %r141, %r142}, [%rd211];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r135, %r136, %r137, %r138}, [%rd211+1024];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r131, %r132, %r133, %r134}, [%rd211+2048];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r127, %r128, %r129, %r130}, [%rd211+3072];
shl.b64 %rd58, %rd121, 8;
shl.b64 %rd212, %rd121, 10;
add.s64 %rd213, %rd147, %rd212;
shl.b64 %rd214, %rd11, 2;
add.s64 %rd215, %rd213, %rd214;
ld.shared.f32 %f708, [%rd215];
shl.b64 %rd59, %rd123, 8;
shl.b64 %rd216, %rd123, 10;
add.s64 %rd217, %rd147, %rd216;
shl.b64 %rd218, %rd12, 2;
add.s64 %rd219, %rd217, %rd218;
ld.shared.f32 %f707, [%rd219];
shl.b64 %rd220, %rd15, 2;
add.s64 %rd221, %rd213, %rd220;
ld.shared.f32 %f706, [%rd221];
shl.b64 %rd222, %rd16, 2;
add.s64 %rd223, %rd217, %rd222;
ld.shared.f32 %f705, [%rd223];
shl.b64 %rd224, %rd19, 2;
add.s64 %rd225, %rd213, %rd224;
ld.shared.f32 %f704, [%rd225];
shl.b64 %rd226, %rd20, 2;
add.s64 %rd227, %rd217, %rd226;
ld.shared.f32 %f703, [%rd227];
shl.b64 %rd228, %rd23, 2;
add.s64 %rd229, %rd213, %rd228;
ld.shared.f32 %f702, [%rd229];
shl.b64 %rd230, %rd24, 2;
add.s64 %rd231, %rd217, %rd230;
ld.shared.f32 %f701, [%rd231];
shl.b64 %rd232, %rd27, 2;
add.s64 %rd233, %rd213, %rd232;
ld.shared.f32 %f700, [%rd233];
shl.b64 %rd234, %rd28, 2;
add.s64 %rd235, %rd217, %rd234;
ld.shared.f32 %f699, [%rd235];
shl.b64 %rd236, %rd31, 2;
add.s64 %rd237, %rd213, %rd236;
ld.shared.f32 %f698, [%rd237];
shl.b64 %rd238, %rd32, 2;
add.s64 %rd239, %rd217, %rd238;
ld.shared.f32 %f697, [%rd239];
shl.b64 %rd240, %rd35, 2;
add.s64 %rd241, %rd213, %rd240;
ld.shared.f32 %f696, [%rd241];
shl.b64 %rd242, %rd36, 2;
add.s64 %rd243, %rd217, %rd242;
ld.shared.f32 %f695, [%rd243];
shl.b64 %rd244, %rd39, 2;
add.s64 %rd245, %rd213, %rd244;
ld.shared.f32 %f694, [%rd245];
shl.b64 %rd246, %rd40, 2;
add.s64 %rd247, %rd217, %rd246;
ld.shared.f32 %f693, [%rd247];
shl.b64 %rd66, %rd125, 8;
shl.b64 %rd67, %rd127, 8;
mul.wide.u32 %rd248, %r49, 16;
add.s64 %rd249, %rd248, %rd144;
shl.b64 %rd250, %rd98, 6;
sub.s64 %rd251, %rd249, %rd250;
add.s64 %rd252, %rd251, %rd94;
add.s64 %rd538, %rd252, 192;
mul.wide.u32 %rd253, %r51, 1048576;
mul.wide.u32 %rd254, %r50, 262144;
add.s64 %rd255, %rd253, %rd254;
mul.lo.s64 %rd256, %rd98, 8128;
add.s64 %rd257, %rd255, %rd256;
add.s64 %rd258, %rd257, %rd248;
add.s64 %rd259, %rd258, %rd94;
add.s64 %rd537, %rd259, 192;
mul.wide.u32 %rd260, %r50, 8192;
mul.lo.s64 %rd261, %rd108, 3072;
add.s64 %rd262, %rd260, %rd261;
mul.wide.u32 %rd263, %r52, 1024;
add.s64 %rd264, %rd262, %rd263;
add.s64 %rd265, %rd264, %rd248;
add.s64 %rd266, %rd265, %rd95;
add.s64 %rd535, %rd266, 196608;
add.s64 %rd267, %rd108, %rd110;
shl.b64 %rd268, %rd267, 10;
add.s64 %rd534, %rd268, 49152;
mov.f32 %f709, 0f00000000;
mov.u64 %rd541, 0;
mov.u64 %rd540, 1;
mov.u64 %rd539, 2;
mov.u64 %rd536, -16;
mov.u64 %rd533, 3;
shl.b64 %rd280, %rd8, 2;
shl.b64 %rd287, %rd13, 2;
shl.b64 %rd292, %rd14, 2;
shl.b64 %rd294, %rd17, 2;
shl.b64 %rd296, %rd18, 2;
shl.b64 %rd298, %rd21, 2;
shl.b64 %rd300, %rd22, 2;
shl.b64 %rd302, %rd25, 2;
shl.b64 %rd304, %rd26, 2;
shl.b64 %rd306, %rd29, 2;
shl.b64 %rd308, %rd30, 2;
shl.b64 %rd310, %rd33, 2;
shl.b64 %rd312, %rd34, 2;
shl.b64 %rd314, %rd37, 2;
shl.b64 %rd316, %rd38, 2;
shl.b64 %rd318, %rd41, 2;
shl.b64 %rd320, %rd42, 2;
mov.f32 %f710, %f709;
mov.f32 %f711, %f709;
mov.f32 %f712, %f709;
mov.f32 %f713, %f709;
mov.f32 %f714, %f709;
mov.f32 %f715, %f709;
mov.f32 %f716, %f709;
mov.f32 %f717, %f709;
mov.f32 %f718, %f709;
mov.f32 %f719, %f709;
mov.f32 %f720, %f709;
mov.f32 %f721, %f709;
mov.f32 %f722, %f709;
mov.f32 %f723, %f709;
mov.f32 %f724, %f709;
mov.f32 %f725, %f709;
mov.f32 %f726, %f709;
mov.f32 %f727, %f709;
mov.f32 %f728, %f709;
mov.f32 %f729, %f709;
mov.f32 %f730, %f709;
mov.f32 %f731, %f709;
mov.f32 %f732, %f709;
mov.f32 %f733, %f709;
mov.f32 %f734, %f709;
mov.f32 %f735, %f709;
mov.f32 %f736, %f709;
mov.f32 %f737, %f709;
mov.f32 %f738, %f709;
mov.f32 %f739, %f709;
mov.f32 %f740, %f709;
mov.f32 %f741, %f709;
mov.f32 %f742, %f709;
mov.f32 %f743, %f709;
mov.f32 %f744, %f709;
mov.f32 %f745, %f709;
mov.f32 %f746, %f709;
mov.f32 %f747, %f709;
mov.f32 %f748, %f709;
mov.f32 %f749, %f709;
mov.f32 %f750, %f709;
mov.f32 %f751, %f709;
mov.f32 %f752, %f709;
mov.f32 %f753, %f709;
mov.f32 %f754, %f709;
mov.f32 %f755, %f709;
mov.f32 %f756, %f709;
mov.f32 %f757, %f709;
mov.f32 %f758, %f709;
mov.f32 %f759, %f709;
mov.f32 %f760, %f709;
mov.f32 %f761, %f709;
mov.f32 %f762, %f709;
mov.f32 %f763, %f709;
mov.f32 %f764, %f709;
mov.f32 %f765, %f709;
mov.f32 %f766, %f709;
mov.f32 %f767, %f709;
mov.f32 %f768, %f709;
mov.f32 %f769, %f709;
mov.f32 %f770, %f709;
mov.f32 %f771, %f709;
mov.f32 %f772, %f709;
mov.f32 %f773, %f709;
mov.f32 %f774, %f709;
mov.f32 %f775, %f709;
mov.f32 %f776, %f709;
mov.f32 %f777, %f709;
mov.f32 %f778, %f709;
mov.f32 %f779, %f709;
mov.f32 %f780, %f709;
mov.f32 %f781, %f709;
mov.f32 %f782, %f709;
mov.f32 %f783, %f709;
mov.f32 %f784, %f709;
mov.f32 %f785, %f709;
mov.f32 %f786, %f709;
mov.f32 %f787, %f709;
mov.f32 %f788, %f709;
mov.f32 %f789, %f709;
mov.f32 %f790, %f709;
mov.f32 %f791, %f709;
mov.f32 %f792, %f709;
mov.f32 %f793, %f709;
mov.f32 %f794, %f709;
mov.f32 %f795, %f709;
mov.f32 %f796, %f709;
mov.f32 %f797, %f709;
mov.f32 %f798, %f709;
mov.f32 %f799, %f709;
mov.f32 %f800, %f709;
mov.f32 %f801, %f709;
mov.f32 %f802, %f709;
mov.f32 %f803, %f709;
mov.f32 %f804, %f709;
mov.f32 %f805, %f709;
mov.f32 %f806, %f709;
mov.f32 %f807, %f709;
mov.f32 %f808, %f709;
mov.f32 %f809, %f709;
mov.f32 %f810, %f709;
mov.f32 %f811, %f709;
mov.f32 %f812, %f709;
mov.f32 %f813, %f709;
mov.f32 %f814, %f709;
mov.f32 %f815, %f709;
mov.f32 %f816, %f709;
mov.f32 %f817, %f709;
mov.f32 %f818, %f709;
mov.f32 %f819, %f709;
mov.f32 %f820, %f709;
mov.f32 %f821, %f709;
mov.f32 %f822, %f709;
mov.f32 %f823, %f709;
mov.f32 %f824, %f709;
mov.f32 %f825, %f709;
mov.f32 %f826, %f709;
mov.f32 %f827, %f709;
mov.f32 %f828, %f709;
mov.f32 %f829, %f709;
mov.f32 %f830, %f709;
mov.f32 %f831, %f709;
mov.f32 %f832, %f709;
mov.f32 %f833, %f709;
mov.f32 %f834, %f709;
mov.f32 %f835, %f709;
mov.f32 %f836, %f709;
$L__BB0_1:
mov.u64 %rd79, %rd540;
mov.u64 %rd540, %rd539;
add.s64 %rd536, %rd536, 16;
setp.lt.u64 %p1, %rd536, 2000;
shl.b64 %rd275, %rd541, 13;
add.s64 %rd277, %rd137, %rd275;
add.s64 %rd279, %rd277, %rd208;
add.s64 %rd281, %rd279, %rd280;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r78, %r79, %r80, %r81}, [%rd281];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r82, %r83, %r84, %r85}, [%rd281+1024];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r86, %r87, %r88, %r89}, [%rd281+2048];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r90, %r91, %r92, %r93}, [%rd281+3072];
shl.b64 %rd282, %rd541, 12;
or.b64 %rd283, %rd282, %rd66;
shl.b64 %rd284, %rd283, 2;
add.s64 %rd286, %rd147, %rd284;
add.s64 %rd288, %rd286, %rd287;
ld.shared.u32 %r94, [%rd288];
or.b64 %rd289, %rd282, %rd67;
shl.b64 %rd290, %rd289, 2;
add.s64 %rd291, %rd147, %rd290;
add.s64 %rd293, %rd291, %rd292;
ld.shared.u32 %r95, [%rd293];
add.s64 %rd295, %rd286, %rd294;
ld.shared.u32 %r96, [%rd295];
add.s64 %rd297, %rd291, %rd296;
ld.shared.u32 %r97, [%rd297];
add.s64 %rd299, %rd286, %rd298;
ld.shared.u32 %r98, [%rd299];
add.s64 %rd301, %rd291, %rd300;
ld.shared.u32 %r99, [%rd301];
add.s64 %rd303, %rd286, %rd302;
ld.shared.u32 %r100, [%rd303];
add.s64 %rd305, %rd291, %rd304;
ld.shared.u32 %r101, [%rd305];
add.s64 %rd307, %rd286, %rd306;
ld.shared.u32 %r102, [%rd307];
add.s64 %rd309, %rd291, %rd308;
ld.shared.u32 %r103, [%rd309];
add.s64 %rd311, %rd286, %rd310;
ld.shared.u32 %r104, [%rd311];
add.s64 %rd313, %rd291, %rd312;
ld.shared.u32 %r105, [%rd313];
add.s64 %rd315, %rd286, %rd314;
ld.shared.u32 %r106, [%rd315];
add.s64 %rd317, %rd291, %rd316;
ld.shared.u32 %r107, [%rd317];
add.s64 %rd319, %rd286, %rd318;
ld.shared.u32 %r108, [%rd319];
add.s64 %rd321, %rd291, %rd320;
ld.shared.u32 %r109, [%rd321];
mov.b32 %r110, %f708;
mov.b32 %r111, %f707;
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f437, %f438, %f439, %f440},
{%r139, %r140, %r141, %r142},
{%r110, %r111},
{%f833, %f834, %f835, %f836};
mov.b32 %r112, %f706;
mov.b32 %r113, %f705;
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f441, %f442, %f443, %f444},
{%r139, %r140, %r141, %r142},
{%r112, %r113},
{%f829, %f830, %f831, %f832};
mov.b32 %r114, %f704;
mov.b32 %r115, %f703;
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f445, %f446, %f447, %f448},
{%r139, %r140, %r141, %r142},
{%r114, %r115},
{%f825, %f826, %f827, %f828};
mov.b32 %r116, %f702;
mov.b32 %r117, %f701;
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f449, %f450, %f451, %f452},
{%r139, %r140, %r141, %r142},
{%r116, %r117},
{%f821, %f822, %f823, %f824};
mov.b32 %r118, %f700;
mov.b32 %r119, %f699;
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f453, %f454, %f455, %f456},
{%r139, %r140, %r141, %r142},
{%r118, %r119},
{%f817, %f818, %f819, %f820};
mov.b32 %r120, %f698;
mov.b32 %r121, %f697;
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f457, %f458, %f459, %f460},
{%r139, %r140, %r141, %r142},
{%r120, %r121},
{%f813, %f814, %f815, %f816};
mov.b32 %r122, %f696;
mov.b32 %r123, %f695;
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f461, %f462, %f463, %f464},
{%r139, %r140, %r141, %r142},
{%r122, %r123},
{%f809, %f810, %f811, %f812};
mov.b32 %r124, %f694;
mov.b32 %r125, %f693;
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f465, %f466, %f467, %f468},
{%r139, %r140, %r141, %r142},
{%r124, %r125},
{%f805, %f806, %f807, %f808};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f469, %f470, %f471, %f472},
{%r135, %r136, %r137, %r138},
{%r110, %r111},
{%f801, %f802, %f803, %f804};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f473, %f474, %f475, %f476},
{%r135, %r136, %r137, %r138},
{%r112, %r113},
{%f797, %f798, %f799, %f800};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f477, %f478, %f479, %f480},
{%r135, %r136, %r137, %r138},
{%r114, %r115},
{%f793, %f794, %f795, %f796};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f481, %f482, %f483, %f484},
{%r135, %r136, %r137, %r138},
{%r116, %r117},
{%f789, %f790, %f791, %f792};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f485, %f486, %f487, %f488},
{%r135, %r136, %r137, %r138},
{%r118, %r119},
{%f785, %f786, %f787, %f788};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f489, %f490, %f491, %f492},
{%r135, %r136, %r137, %r138},
{%r120, %r121},
{%f781, %f782, %f783, %f784};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f493, %f494, %f495, %f496},
{%r135, %r136, %r137, %r138},
{%r122, %r123},
{%f777, %f778, %f779, %f780};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f497, %f498, %f499, %f500},
{%r135, %r136, %r137, %r138},
{%r124, %r125},
{%f773, %f774, %f775, %f776};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f501, %f502, %f503, %f504},
{%r131, %r132, %r133, %r134},
{%r110, %r111},
{%f769, %f770, %f771, %f772};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f505, %f506, %f507, %f508},
{%r131, %r132, %r133, %r134},
{%r112, %r113},
{%f765, %f766, %f767, %f768};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f509, %f510, %f511, %f512},
{%r131, %r132, %r133, %r134},
{%r114, %r115},
{%f761, %f762, %f763, %f764};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f513, %f514, %f515, %f516},
{%r131, %r132, %r133, %r134},
{%r116, %r117},
{%f757, %f758, %f759, %f760};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f517, %f518, %f519, %f520},
{%r131, %r132, %r133, %r134},
{%r118, %r119},
{%f753, %f754, %f755, %f756};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f521, %f522, %f523, %f524},
{%r131, %r132, %r133, %r134},
{%r120, %r121},
{%f749, %f750, %f751, %f752};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f525, %f526, %f527, %f528},
{%r131, %r132, %r133, %r134},
{%r122, %r123},
{%f745, %f746, %f747, %f748};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f529, %f530, %f531, %f532},
{%r131, %r132, %r133, %r134},
{%r124, %r125},
{%f741, %f742, %f743, %f744};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f533, %f534, %f535, %f536},
{%r127, %r128, %r129, %r130},
{%r110, %r111},
{%f737, %f738, %f739, %f740};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f537, %f538, %f539, %f540},
{%r127, %r128, %r129, %r130},
{%r112, %r113},
{%f733, %f734, %f735, %f736};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f541, %f542, %f543, %f544},
{%r127, %r128, %r129, %r130},
{%r114, %r115},
{%f729, %f730, %f731, %f732};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f545, %f546, %f547, %f548},
{%r127, %r128, %r129, %r130},
{%r116, %r117},
{%f725, %f726, %f727, %f728};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f549, %f550, %f551, %f552},
{%r127, %r128, %r129, %r130},
{%r118, %r119},
{%f721, %f722, %f723, %f724};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f553, %f554, %f555, %f556},
{%r127, %r128, %r129, %r130},
{%r120, %r121},
{%f717, %f718, %f719, %f720};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f557, %f558, %f559, %f560},
{%r127, %r128, %r129, %r130},
{%r122, %r123},
{%f713, %f714, %f715, %f716};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f561, %f562, %f563, %f564},
{%r127, %r128, %r129, %r130},
{%r124, %r125},
{%f709, %f710, %f711, %f712};
and.b64 %rd539, %rd533, 3;
shl.b64 %rd322, %rd539, 11;
or.b64 %rd323, %rd322, %rd43;
shl.b64 %rd324, %rd323, 2;
add.s64 %rd325, %rd60, %rd324;
selp.b32 %r67, 16, 0, %p1;
cvt.u32.u64 %r66, %rd325;
// begin inline asm
cp.async.cg.shared.global [%r66], [%rd537], 16, %r67;
// end inline asm
shl.b64 %rd326, %rd539, 13;
add.s64 %rd327, %rd44, %rd326;
cvt.u32.u64 %r126, %rd327;
add.s32 %r68, %r126, 4096;
// begin inline asm
cp.async.cg.shared.global [%r68], [%rd538], 16, %r67;
// end inline asm
shl.b64 %rd328, %rd539, 12;
or.b64 %rd329, %rd328, %rd45;
shl.b64 %rd330, %rd329, 2;
add.s64 %rd331, %rd61, %rd330;
cvt.u32.u64 %r70, %rd331;
// begin inline asm
cp.async.cg.shared.global [%r70], [%rd535], 16, %r67;
// end inline asm
or.b64 %rd332, %rd328, %rd46;
shl.b64 %rd333, %rd332, 2;
add.s64 %rd334, %rd63, %rd333;
shl.b64 %rd335, %rd534, 2;
or.b64 %rd336, %rd335, 16384;
add.s64 %rd272, %rd62, %rd336;
cvt.u32.u64 %r72, %rd334;
// begin inline asm
cp.async.cg.shared.global [%r72], [%rd272], 16, %r67;
// end inline asm
or.b64 %rd337, %rd328, %rd47;
shl.b64 %rd338, %rd337, 2;
add.s64 %rd339, %rd64, %rd338;
or.b64 %rd340, %rd335, 32768;
add.s64 %rd273, %rd62, %rd340;
cvt.u32.u64 %r74, %rd339;
// begin inline asm
cp.async.cg.shared.global [%r74], [%rd273], 16, %r67;
// end inline asm
or.b64 %rd341, %rd328, %rd48;
shl.b64 %rd342, %rd341, 2;
add.s64 %rd343, %rd65, %rd342;
or.b64 %rd344, %rd335, 49152;
add.s64 %rd274, %rd62, %rd344;
cvt.u32.u64 %r76, %rd343;
// begin inline asm
cp.async.cg.shared.global [%r76], [%rd274], 16, %r67;
// end inline asm
cp.async.commit_group;
cp.async.wait_group 2;
bar.sync 0;
shl.b64 %rd345, %rd79, 13;
add.s64 %rd346, %rd137, %rd345;
add.s64 %rd347, %rd346, %rd208;
add.s64 %rd349, %rd347, %rd210;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r139, %r140, %r141, %r142}, [%rd349];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r135, %r136, %r137, %r138}, [%rd349+1024];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r131, %r132, %r133, %r134}, [%rd349+2048];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r127, %r128, %r129, %r130}, [%rd349+3072];
shl.b64 %rd350, %rd79, 12;
or.b64 %rd351, %rd350, %rd58;
shl.b64 %rd352, %rd351, 2;
add.s64 %rd353, %rd147, %rd352;
add.s64 %rd355, %rd353, %rd214;
ld.shared.f32 %f708, [%rd355];
or.b64 %rd356, %rd350, %rd59;
shl.b64 %rd357, %rd356, 2;
add.s64 %rd358, %rd147, %rd357;
add.s64 %rd360, %rd358, %rd218;
ld.shared.f32 %f707, [%rd360];
add.s64 %rd362, %rd353, %rd220;
ld.shared.f32 %f706, [%rd362];
add.s64 %rd364, %rd358, %rd222;
ld.shared.f32 %f705, [%rd364];
add.s64 %rd366, %rd353, %rd224;
ld.shared.f32 %f704, [%rd366];
add.s64 %rd368, %rd358, %rd226;
ld.shared.f32 %f703, [%rd368];
add.s64 %rd370, %rd353, %rd228;
ld.shared.f32 %f702, [%rd370];
add.s64 %rd372, %rd358, %rd230;
ld.shared.f32 %f701, [%rd372];
add.s64 %rd374, %rd353, %rd232;
ld.shared.f32 %f700, [%rd374];
add.s64 %rd376, %rd358, %rd234;
ld.shared.f32 %f699, [%rd376];
add.s64 %rd378, %rd353, %rd236;
ld.shared.f32 %f698, [%rd378];
add.s64 %rd380, %rd358, %rd238;
ld.shared.f32 %f697, [%rd380];
add.s64 %rd382, %rd353, %rd240;
ld.shared.f32 %f696, [%rd382];
add.s64 %rd384, %rd358, %rd242;
ld.shared.f32 %f695, [%rd384];
add.s64 %rd386, %rd353, %rd244;
ld.shared.f32 %f694, [%rd386];
add.s64 %rd388, %rd358, %rd246;
ld.shared.f32 %f693, [%rd388];
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f833, %f834, %f835, %f836},
{%r78, %r79, %r80, %r81},
{%r94, %r95},
{%f437, %f438, %f439, %f440};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f829, %f830, %f831, %f832},
{%r78, %r79, %r80, %r81},
{%r96, %r97},
{%f441, %f442, %f443, %f444};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f825, %f826, %f827, %f828},
{%r78, %r79, %r80, %r81},
{%r98, %r99},
{%f445, %f446, %f447, %f448};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f821, %f822, %f823, %f824},
{%r78, %r79, %r80, %r81},
{%r100, %r101},
{%f449, %f450, %f451, %f452};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f817, %f818, %f819, %f820},
{%r78, %r79, %r80, %r81},
{%r102, %r103},
{%f453, %f454, %f455, %f456};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f813, %f814, %f815, %f816},
{%r78, %r79, %r80, %r81},
{%r104, %r105},
{%f457, %f458, %f459, %f460};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f809, %f810, %f811, %f812},
{%r78, %r79, %r80, %r81},
{%r106, %r107},
{%f461, %f462, %f463, %f464};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f805, %f806, %f807, %f808},
{%r78, %r79, %r80, %r81},
{%r108, %r109},
{%f465, %f466, %f467, %f468};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f801, %f802, %f803, %f804},
{%r82, %r83, %r84, %r85},
{%r94, %r95},
{%f469, %f470, %f471, %f472};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f797, %f798, %f799, %f800},
{%r82, %r83, %r84, %r85},
{%r96, %r97},
{%f473, %f474, %f475, %f476};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f793, %f794, %f795, %f796},
{%r82, %r83, %r84, %r85},
{%r98, %r99},
{%f477, %f478, %f479, %f480};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f789, %f790, %f791, %f792},
{%r82, %r83, %r84, %r85},
{%r100, %r101},
{%f481, %f482, %f483, %f484};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f785, %f786, %f787, %f788},
{%r82, %r83, %r84, %r85},
{%r102, %r103},
{%f485, %f486, %f487, %f488};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f781, %f782, %f783, %f784},
{%r82, %r83, %r84, %r85},
{%r104, %r105},
{%f489, %f490, %f491, %f492};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f777, %f778, %f779, %f780},
{%r82, %r83, %r84, %r85},
{%r106, %r107},
{%f493, %f494, %f495, %f496};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f773, %f774, %f775, %f776},
{%r82, %r83, %r84, %r85},
{%r108, %r109},
{%f497, %f498, %f499, %f500};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f769, %f770, %f771, %f772},
{%r86, %r87, %r88, %r89},
{%r94, %r95},
{%f501, %f502, %f503, %f504};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f765, %f766, %f767, %f768},
{%r86, %r87, %r88, %r89},
{%r96, %r97},
{%f505, %f506, %f507, %f508};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f761, %f762, %f763, %f764},
{%r86, %r87, %r88, %r89},
{%r98, %r99},
{%f509, %f510, %f511, %f512};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f757, %f758, %f759, %f760},
{%r86, %r87, %r88, %r89},
{%r100, %r101},
{%f513, %f514, %f515, %f516};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f753, %f754, %f755, %f756},
{%r86, %r87, %r88, %r89},
{%r102, %r103},
{%f517, %f518, %f519, %f520};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f749, %f750, %f751, %f752},
{%r86, %r87, %r88, %r89},
{%r104, %r105},
{%f521, %f522, %f523, %f524};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f745, %f746, %f747, %f748},
{%r86, %r87, %r88, %r89},
{%r106, %r107},
{%f525, %f526, %f527, %f528};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f741, %f742, %f743, %f744},
{%r86, %r87, %r88, %r89},
{%r108, %r109},
{%f529, %f530, %f531, %f532};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f737, %f738, %f739, %f740},
{%r90, %r91, %r92, %r93},
{%r94, %r95},
{%f533, %f534, %f535, %f536};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f733, %f734, %f735, %f736},
{%r90, %r91, %r92, %r93},
{%r96, %r97},
{%f537, %f538, %f539, %f540};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f729, %f730, %f731, %f732},
{%r90, %r91, %r92, %r93},
{%r98, %r99},
{%f541, %f542, %f543, %f544};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f725, %f726, %f727, %f728},
{%r90, %r91, %r92, %r93},
{%r100, %r101},
{%f545, %f546, %f547, %f548};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f721, %f722, %f723, %f724},
{%r90, %r91, %r92, %r93},
{%r102, %r103},
{%f549, %f550, %f551, %f552};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f717, %f718, %f719, %f720},
{%r90, %r91, %r92, %r93},
{%r104, %r105},
{%f553, %f554, %f555, %f556};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f713, %f714, %f715, %f716},
{%r90, %r91, %r92, %r93},
{%r106, %r107},
{%f557, %f558, %f559, %f560};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f709, %f710, %f711, %f712},
{%r90, %r91, %r92, %r93},
{%r108, %r109},
{%f561, %f562, %f563, %f564};
add.s64 %rd538, %rd538, 64;
add.s64 %rd537, %rd537, 64;
add.s64 %rd535, %rd535, 65536;
add.s64 %rd534, %rd534, 16384;
add.s64 %rd533, %rd533, 1;
setp.lt.u64 %p2, %rd536, 2032;
mov.u64 %rd541, %rd79;
@%p2 bra $L__BB0_1;
add.s64 %rd389, %rd5, %rd9;
shl.b64 %rd390, %rd9, 3;
sub.s64 %rd391, %rd6, %rd390;
add.s64 %rd392, %rd391, %rd10;
cp.async.commit_group;
cp.async.wait_group 0;
bar.sync 0;
shl.b64 %rd393, %rd389, 10;
add.s64 %rd395, %rd137, %rd393;
shl.b64 %rd396, %rd392, 2;
add.s64 %rd397, %rd395, %rd396;
st.shared.v2.f32 [%rd397+49376], {%f709, %f710};
st.shared.v2.f32 [%rd397+57568], {%f711, %f712};
st.shared.v2.f32 [%rd397+49344], {%f713, %f714};
st.shared.v2.f32 [%rd397+57536], {%f715, %f716};
st.shared.v2.f32 [%rd397+49312], {%f717, %f718};
st.shared.v2.f32 [%rd397+57504], {%f719, %f720};
st.shared.v2.f32 [%rd397+49280], {%f721, %f722};
st.shared.v2.f32 [%rd397+57472], {%f723, %f724};
st.shared.v2.f32 [%rd397+49248], {%f725, %f726};
st.shared.v2.f32 [%rd397+57440], {%f727, %f728};
st.shared.v2.f32 [%rd397+49216], {%f729, %f730};
st.shared.v2.f32 [%rd397+57408], {%f731, %f732};
st.shared.v2.f32 [%rd397+49184], {%f733, %f734};
st.shared.v2.f32 [%rd397+57376], {%f735, %f736};
st.shared.v2.f32 [%rd397+49152], {%f737, %f738};
st.shared.v2.f32 [%rd397+57344], {%f739, %f740};
st.shared.v2.f32 [%rd397+32992], {%f741, %f742};
st.shared.v2.f32 [%rd397+41184], {%f743, %f744};
st.shared.v2.f32 [%rd397+32960], {%f745, %f746};
st.shared.v2.f32 [%rd397+41152], {%f747, %f748};
st.shared.v2.f32 [%rd397+32928], {%f749, %f750};
st.shared.v2.f32 [%rd397+41120], {%f751, %f752};
st.shared.v2.f32 [%rd397+32896], {%f753, %f754};
st.shared.v2.f32 [%rd397+41088], {%f755, %f756};
st.shared.v2.f32 [%rd397+32864], {%f757, %f758};
st.shared.v2.f32 [%rd397+41056], {%f759, %f760};
st.shared.v2.f32 [%rd397+32832], {%f761, %f762};
st.shared.v2.f32 [%rd397+41024], {%f763, %f764};
st.shared.v2.f32 [%rd397+32800], {%f765, %f766};
st.shared.v2.f32 [%rd397+40992], {%f767, %f768};
st.shared.v2.f32 [%rd397+32768], {%f769, %f770};
st.shared.v2.f32 [%rd397+40960], {%f771, %f772};
st.shared.v2.f32 [%rd397+16608], {%f773, %f774};
st.shared.v2.f32 [%rd397+24800], {%f775, %f776};
st.shared.v2.f32 [%rd397+16576], {%f777, %f778};
st.shared.v2.f32 [%rd397+24768], {%f779, %f780};
st.shared.v2.f32 [%rd397+16544], {%f781, %f782};
st.shared.v2.f32 [%rd397+24736], {%f783, %f784};
st.shared.v2.f32 [%rd397+16512], {%f785, %f786};
st.shared.v2.f32 [%rd397+24704], {%f787, %f788};
st.shared.v2.f32 [%rd397+16480], {%f789, %f790};
st.shared.v2.f32 [%rd397+24672], {%f791, %f792};
st.shared.v2.f32 [%rd397+16448], {%f793, %f794};
st.shared.v2.f32 [%rd397+24640], {%f795, %f796};
st.shared.v2.f32 [%rd397+16416], {%f797, %f798};
st.shared.v2.f32 [%rd397+24608], {%f799, %f800};
st.shared.v2.f32 [%rd397+16384], {%f801, %f802};
st.shared.v2.f32 [%rd397+24576], {%f803, %f804};
st.shared.v2.f32 [%rd397+224], {%f805, %f806};
st.shared.v2.f32 [%rd397+8416], {%f807, %f808};
st.shared.v2.f32 [%rd397+192], {%f809, %f810};
st.shared.v2.f32 [%rd397+8384], {%f811, %f812};
st.shared.v2.f32 [%rd397+160], {%f813, %f814};
st.shared.v2.f32 [%rd397+8352], {%f815, %f816};
st.shared.v2.f32 [%rd397+128], {%f817, %f818};
st.shared.v2.f32 [%rd397+8320], {%f819, %f820};
st.shared.v2.f32 [%rd397+96], {%f821, %f822};
st.shared.v2.f32 [%rd397+8288], {%f823, %f824};
st.shared.v2.f32 [%rd397+64], {%f825, %f826};
st.shared.v2.f32 [%rd397+8256], {%f827, %f828};
st.shared.v2.f32 [%rd397+32], {%f829, %f830};
st.shared.v2.f32 [%rd397+8224], {%f831, %f832};
st.shared.v2.f32 [%rd397], {%f833, %f834};
st.shared.v2.f32 [%rd397+8192], {%f835, %f836};
bar.sync 0;
shl.b64 %rd398, %rd45, 2;
add.s64 %rd399, %rd137, %rd398;
add.s64 %rd401, %rd399, %rd149;
ld.shared.v4.f32 {%f565, %f566, %f567, %f568}, [%rd401];
or.b64 %rd402, %rd4, %rd1;
shl.b64 %rd403, %rd402, 12;
add.s64 %rd404, %rd88, %rd403;
add.s64 %rd406, %rd404, %rd154;
st.global.v4.f32 [%rd406], {%f565, %f566, %f567, %f568};
shl.b64 %rd407, %rd46, 2;
add.s64 %rd408, %rd137, %rd407;
add.s64 %rd409, %rd408, %rd149;
ld.shared.v4.f32 {%f569, %f570, %f571, %f572}, [%rd409];
or.b64 %rd410, %rd403, 16384;
add.s64 %rd411, %rd88, %rd410;
add.s64 %rd412, %rd411, %rd154;
st.global.v4.f32 [%rd412], {%f569, %f570, %f571, %f572};
shl.b64 %rd413, %rd47, 2;
add.s64 %rd414, %rd137, %rd413;
add.s64 %rd415, %rd414, %rd149;
ld.shared.v4.f32 {%f573, %f574, %f575, %f576}, [%rd415];
or.b64 %rd416, %rd403, 32768;
add.s64 %rd417, %rd88, %rd416;
add.s64 %rd418, %rd417, %rd154;
st.global.v4.f32 [%rd418], {%f573, %f574, %f575, %f576};
shl.b64 %rd419, %rd48, 2;
add.s64 %rd420, %rd137, %rd419;
add.s64 %rd421, %rd420, %rd149;
ld.shared.v4.f32 {%f577, %f578, %f579, %f580}, [%rd421];
or.b64 %rd422, %rd403, 49152;
add.s64 %rd423, %rd88, %rd422;
add.s64 %rd424, %rd423, %rd154;
st.global.v4.f32 [%rd424], {%f577, %f578, %f579, %f580};
shl.b64 %rd425, %rd49, 10;
add.s64 %rd426, %rd137, %rd425;
add.s64 %rd427, %rd426, %rd149;
ld.shared.v4.f32 {%f581, %f582, %f583, %f584}, [%rd427];
or.b64 %rd428, %rd403, 65536;
add.s64 %rd429, %rd88, %rd428;
add.s64 %rd430, %rd429, %rd154;
st.global.v4.f32 [%rd430], {%f581, %f582, %f583, %f584};
shl.b64 %rd431, %rd50, 10;
add.s64 %rd432, %rd137, %rd431;
add.s64 %rd433, %rd432, %rd149;
ld.shared.v4.f32 {%f585, %f586, %f587, %f588}, [%rd433];
or.b64 %rd434, %rd403, 81920;
add.s64 %rd435, %rd88, %rd434;
add.s64 %rd436, %rd435, %rd154;
st.global.v4.f32 [%rd436], {%f585, %f586, %f587, %f588};
shl.b64 %rd437, %rd51, 10;
add.s64 %rd438, %rd137, %rd437;
add.s64 %rd439, %rd438, %rd149;
ld.shared.v4.f32 {%f589, %f590, %f591, %f592}, [%rd439];
or.b64 %rd440, %rd403, 98304;
add.s64 %rd441, %rd88, %rd440;
add.s64 %rd442, %rd441, %rd154;
st.global.v4.f32 [%rd442], {%f589, %f590, %f591, %f592};
shl.b64 %rd443, %rd52, 10;
add.s64 %rd444, %rd137, %rd443;
add.s64 %rd445, %rd444, %rd149;
ld.shared.v4.f32 {%f593, %f594, %f595, %f596}, [%rd445];
or.b64 %rd446, %rd403, 114688;
add.s64 %rd447, %rd88, %rd446;
add.s64 %rd448, %rd447, %rd154;
st.global.v4.f32 [%rd448], {%f593, %f594, %f595, %f596};
shl.b64 %rd449, %rd53, 10;
add.s64 %rd450, %rd137, %rd449;
add.s64 %rd451, %rd450, %rd149;
ld.shared.v4.f32 {%f597, %f598, %f599, %f600}, [%rd451];
or.b64 %rd452, %rd403, 131072;
add.s64 %rd453, %rd88, %rd452;
add.s64 %rd454, %rd453, %rd154;
st.global.v4.f32 [%rd454], {%f597, %f598, %f599, %f600};
shl.b64 %rd455, %rd54, 10;
add.s64 %rd456, %rd137, %rd455;
add.s64 %rd457, %rd456, %rd149;
ld.shared.v4.f32 {%f601, %f602, %f603, %f604}, [%rd457];
or.b64 %rd458, %rd403, 147456;
add.s64 %rd459, %rd88, %rd458;
add.s64 %rd460, %rd459, %rd154;
st.global.v4.f32 [%rd460], {%f601, %f602, %f603, %f604};
shl.b64 %rd461, %rd55, 10;
add.s64 %rd462, %rd137, %rd461;
add.s64 %rd463, %rd462, %rd149;
ld.shared.v4.f32 {%f605, %f606, %f607, %f608}, [%rd463];
or.b64 %rd464, %rd403, 163840;
add.s64 %rd465, %rd88, %rd464;
add.s64 %rd466, %rd465, %rd154;
st.global.v4.f32 [%rd466], {%f605, %f606, %f607, %f608};
shl.b64 %rd467, %rd56, 10;
add.s64 %rd468, %rd137, %rd467;
add.s64 %rd469, %rd468, %rd149;
ld.shared.v4.f32 {%f609, %f610, %f611, %f612}, [%rd469];
or.b64 %rd470, %rd403, 180224;
add.s64 %rd471, %rd88, %rd470;
add.s64 %rd472, %rd471, %rd154;
st.global.v4.f32 [%rd472], {%f609, %f610, %f611, %f612};
ld.shared.v4.f32 {%f613, %f614, %f615, %f616}, [%rd401+49152];
or.b64 %rd473, %rd403, 196608;
add.s64 %rd474, %rd88, %rd473;
add.s64 %rd475, %rd474, %rd154;
st.global.v4.f32 [%rd475], {%f613, %f614, %f615, %f616};
ld.shared.v4.f32 {%f617, %f618, %f619, %f620}, [%rd401+53248];
or.b64 %rd476, %rd403, 212992;
add.s64 %rd477, %rd88, %rd476;
add.s64 %rd478, %rd477, %rd154;
st.global.v4.f32 [%rd478], {%f617, %f618, %f619, %f620};
ld.shared.v4.f32 {%f621, %f622, %f623, %f624}, [%rd401+57344];
or.b64 %rd479, %rd403, 229376;
add.s64 %rd480, %rd88, %rd479;
add.s64 %rd481, %rd480, %rd154;
st.global.v4.f32 [%rd481], {%f621, %f622, %f623, %f624};
ld.shared.v4.f32 {%f625, %f626, %f627, %f628}, [%rd401+61440];
or.b64 %rd482, %rd403, 245760;
add.s64 %rd483, %rd88, %rd482;
add.s64 %rd484, %rd483, %rd154;
st.global.v4.f32 [%rd484], {%f625, %f626, %f627, %f628};
ld.shared.v4.f32 {%f629, %f630, %f631, %f632}, [%rd401+65536];
or.b64 %rd485, %rd403, 262144;
add.s64 %rd486, %rd88, %rd485;
add.s64 %rd487, %rd486, %rd154;
st.global.v4.f32 [%rd487], {%f629, %f630, %f631, %f632};
ld.shared.v4.f32 {%f633, %f634, %f635, %f636}, [%rd401+69632];
or.b64 %rd488, %rd403, 278528;
add.s64 %rd489, %rd88, %rd488;
add.s64 %rd490, %rd489, %rd154;
st.global.v4.f32 [%rd490], {%f633, %f634, %f635, %f636};
ld.shared.v4.f32 {%f637, %f638, %f639, %f640}, [%rd401+73728];
or.b64 %rd491, %rd403, 294912;
add.s64 %rd492, %rd88, %rd491;
add.s64 %rd493, %rd492, %rd154;
st.global.v4.f32 [%rd493], {%f637, %f638, %f639, %f640};
ld.shared.v4.f32 {%f641, %f642, %f643, %f644}, [%rd401+77824];
or.b64 %rd494, %rd403, 311296;
add.s64 %rd495, %rd88, %rd494;
add.s64 %rd496, %rd495, %rd154;
st.global.v4.f32 [%rd496], {%f641, %f642, %f643, %f644};
ld.shared.v4.f32 {%f645, %f646, %f647, %f648}, [%rd401+81920];
or.b64 %rd497, %rd403, 327680;
add.s64 %rd498, %rd88, %rd497;
add.s64 %rd499, %rd498, %rd154;
st.global.v4.f32 [%rd499], {%f645, %f646, %f647, %f648};
ld.shared.v4.f32 {%f649, %f650, %f651, %f652}, [%rd401+86016];
or.b64 %rd500, %rd403, 344064;
add.s64 %rd501, %rd88, %rd500;
add.s64 %rd502, %rd501, %rd154;
st.global.v4.f32 [%rd502], {%f649, %f650, %f651, %f652};
ld.shared.v4.f32 {%f653, %f654, %f655, %f656}, [%rd401+90112];
or.b64 %rd503, %rd403, 360448;
add.s64 %rd504, %rd88, %rd503;
add.s64 %rd505, %rd504, %rd154;
st.global.v4.f32 [%rd505], {%f653, %f654, %f655, %f656};
ld.shared.v4.f32 {%f657, %f658, %f659, %f660}, [%rd401+94208];
or.b64 %rd506, %rd403, 376832;
add.s64 %rd507, %rd88, %rd506;
add.s64 %rd508, %rd507, %rd154;
st.global.v4.f32 [%rd508], {%f657, %f658, %f659, %f660};
ld.shared.v4.f32 {%f661, %f662, %f663, %f664}, [%rd401+98304];
or.b64 %rd509, %rd403, 393216;
add.s64 %rd510, %rd88, %rd509;
add.s64 %rd511, %rd510, %rd154;
st.global.v4.f32 [%rd511], {%f661, %f662, %f663, %f664};
ld.shared.v4.f32 {%f665, %f666, %f667, %f668}, [%rd401+102400];
or.b64 %rd512, %rd403, 409600;
add.s64 %rd513, %rd88, %rd512;
add.s64 %rd514, %rd513, %rd154;
st.global.v4.f32 [%rd514], {%f665, %f666, %f667, %f668};
ld.shared.v4.f32 {%f669, %f670, %f671, %f672}, [%rd401+106496];
or.b64 %rd515, %rd403, 425984;
add.s64 %rd516, %rd88, %rd515;
add.s64 %rd517, %rd516, %rd154;
st.global.v4.f32 [%rd517], {%f669, %f670, %f671, %f672};
ld.shared.v4.f32 {%f673, %f674, %f675, %f676}, [%rd401+110592];
or.b64 %rd518, %rd403, 442368;
add.s64 %rd519, %rd88, %rd518;
add.s64 %rd520, %rd519, %rd154;
st.global.v4.f32 [%rd520], {%f673, %f674, %f675, %f676};
ld.shared.v4.f32 {%f677, %f678, %f679, %f680}, [%rd401+114688];
or.b64 %rd521, %rd403, 458752;
add.s64 %rd522, %rd88, %rd521;
add.s64 %rd523, %rd522, %rd154;
st.global.v4.f32 [%rd523], {%f677, %f678, %f679, %f680};
ld.shared.v4.f32 {%f681, %f682, %f683, %f684}, [%rd401+118784];
or.b64 %rd524, %rd403, 475136;
add.s64 %rd525, %rd88, %rd524;
add.s64 %rd526, %rd525, %rd154;
st.global.v4.f32 [%rd526], {%f681, %f682, %f683, %f684};
ld.shared.v4.f32 {%f685, %f686, %f687, %f688}, [%rd401+122880];
or.b64 %rd527, %rd403, 491520;
add.s64 %rd528, %rd88, %rd527;
add.s64 %rd529, %rd528, %rd154;
st.global.v4.f32 [%rd529], {%f685, %f686, %f687, %f688};
ld.shared.v4.f32 {%f689, %f690, %f691, %f692}, [%rd401+126976];
or.b64 %rd530, %rd403, 507904;
add.s64 %rd531, %rd88, %rd530;
add.s64 %rd532, %rd531, %rd154;
st.global.v4.f32 [%rd532], {%f689, %f690, %f691, %f692};
bar.sync 0;
ret;
}
//
// Generated by LLVM NVPTX Back-End
//
// Module-level header: the kernel that follows is compiled against these
// settings.
//
// PTX ISA version this module is written in.
.version 7.6
// Target GPU architecture. The kernel below issues cp.async and
// mma.sync ... .tf32 instructions, both of which need sm_80 or newer.
.target sm_80
// Addresses are 64 bits wide; the kernel takes its three pointer parameters
// as .u64 and does all address arithmetic in 64-bit registers.
.address_size 64
// .globl matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32
// Externally declared, unsized byte array in the .shared state space, aligned
// to 16 bytes. The kernel takes its address as the base for every shared-memory
// access it makes (cp.async destinations, ldmatrix / ld.shared sources).
// The 16-byte alignment matches the 16-byte cp.async copies written into it.
// NOTE(review): no size is given here; presumably the byte count is supplied as
// the dynamic shared-memory size at kernel launch -- confirm against the host
// launch code.
.extern .shared .align 16 .b8 __dynamic_shared_memory__[];
.visible .entry matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32(
.param .u64 matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_0,
.param .u64 matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_1,
.param .u64 matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_2
)
.maxntid 128, 2, 1
{
.reg .pred %p<3>;
.reg .b32 %r<143>;
.reg .f32 %f<837>;
.reg .b64 %rd<438>;
ld.param.u64 %rd82, [matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_2];
ld.param.u64 %rd88, [matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_0];
ld.param.u64 %rd89, [matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_1];
mov.u32 %r49, %tid.x;
cvt.u64.u32 %rd90, %r49;
mov.u32 %r50, %tid.y;
mov.u32 %r51, %ctaid.y;
mov.u32 %r52, %ctaid.x;
shl.b32 %r53, %r51, 7;
cvt.u64.u32 %rd1, %r53;
mul.wide.u32 %rd91, %r50, 32;
shr.u64 %rd92, %rd90, 2;
or.b64 %rd93, %rd92, %rd91;
or.b64 %rd94, %rd93, %rd1;
mul.wide.u32 %rd95, %r49, 4;
shl.b64 %rd96, %rd92, 4;
sub.s64 %rd97, %rd95, %rd96;
shr.u64 %rd98, %rd90, 1;
and.b64 %rd99, %rd98, 12;
xor.b64 %rd100, %rd97, %rd99;
mul.wide.u32 %rd101, %r52, 256;
shr.u64 %rd102, %rd90, 6;
shl.b64 %rd103, %rd102, 8;
sub.s64 %rd2, %rd95, %rd103;
add.s64 %rd3, %rd2, %rd101;
mul.wide.u32 %rd104, %r50, 2;
or.b64 %rd4, %rd104, %rd102;
or.b64 %rd105, %rd4, 4;
or.b64 %rd106, %rd4, 8;
or.b64 %rd107, %rd4, 12;
mov.u32 %r54, %laneid;
cvt.s64.s32 %rd108, %r54;
mul.wide.u32 %rd5, %r50, 64;
add.s64 %rd109, %rd5, %rd108;
shr.s32 %r55, %r54, 31;
xor.b32 %r56, %r55, %r54;
shr.s32 %r57, %r56, 31;
shr.u32 %r58, %r57, 28;
add.s32 %r59, %r56, %r58;
shr.s32 %r60, %r59, 4;
xor.b32 %r61, %r60, %r55;
mul.wide.s32 %rd110, %r61, 4;
mul.wide.s32 %rd6, %r54, 2;
and.b64 %rd111, %rd6, 12;
xor.b64 %rd7, %rd110, %rd111;
add.s64 %rd112, %rd110, 8;
xor.b64 %rd8, %rd112, %rd111;
shr.u32 %r62, %r57, 30;
add.s32 %r63, %r56, %r62;
shr.s32 %r64, %r63, 2;
xor.b32 %r65, %r64, %r55;
cvt.s64.s32 %rd9, %r65;
mul.wide.u32 %rd113, %r49, 2;
and.b64 %rd10, %rd113, 192;
add.s64 %rd114, %rd10, %rd9;
and.b64 %rd115, %rd108, 3;
shl.b64 %rd116, %rd115, 2;
xor.b64 %rd11, %rd114, %rd116;
or.b64 %rd117, %rd115, 4;
shl.b64 %rd118, %rd117, 2;
xor.b64 %rd12, %rd114, %rd118;
or.b64 %rd119, %rd115, 8;
shl.b64 %rd120, %rd119, 2;
xor.b64 %rd13, %rd114, %rd120;
or.b64 %rd121, %rd115, 12;
shl.b64 %rd122, %rd121, 2;
xor.b64 %rd14, %rd114, %rd122;
add.s64 %rd123, %rd114, 8;
xor.b64 %rd15, %rd123, %rd116;
xor.b64 %rd16, %rd123, %rd118;
xor.b64 %rd17, %rd123, %rd120;
xor.b64 %rd18, %rd123, %rd122;
add.s64 %rd124, %rd114, 16;
xor.b64 %rd19, %rd124, %rd116;
xor.b64 %rd20, %rd124, %rd118;
xor.b64 %rd21, %rd124, %rd120;
xor.b64 %rd22, %rd124, %rd122;
add.s64 %rd125, %rd114, 24;
xor.b64 %rd23, %rd125, %rd116;
xor.b64 %rd24, %rd125, %rd118;
xor.b64 %rd25, %rd125, %rd120;
xor.b64 %rd26, %rd125, %rd122;
add.s64 %rd126, %rd114, 32;
xor.b64 %rd27, %rd126, %rd116;
xor.b64 %rd28, %rd126, %rd118;
xor.b64 %rd29, %rd126, %rd120;
xor.b64 %rd30, %rd126, %rd122;
add.s64 %rd127, %rd114, 40;
xor.b64 %rd31, %rd127, %rd116;
xor.b64 %rd32, %rd127, %rd118;
xor.b64 %rd33, %rd127, %rd120;
xor.b64 %rd34, %rd127, %rd122;
add.s64 %rd128, %rd114, 48;
xor.b64 %rd35, %rd128, %rd116;
xor.b64 %rd36, %rd128, %rd118;
xor.b64 %rd37, %rd128, %rd120;
xor.b64 %rd38, %rd128, %rd122;
add.s64 %rd129, %rd114, 56;
xor.b64 %rd39, %rd129, %rd116;
xor.b64 %rd40, %rd129, %rd118;
xor.b64 %rd41, %rd129, %rd120;
xor.b64 %rd42, %rd129, %rd122;
shl.b64 %rd43, %rd93, 4;
shl.b64 %rd130, %rd93, 6;
mov.u64 %rd131, __dynamic_shared_memory__;
shl.b64 %rd132, %rd100, 2;
add.s64 %rd60, %rd131, %rd132;
add.s64 %rd44, %rd60, %rd130;
shl.b64 %rd133, %rd94, 13;
add.s64 %rd134, %rd88, %rd133;
shl.b64 %rd135, %rd97, 2;
add.s64 %rd136, %rd134, %rd135;
cp.async.cg.shared.global [%rd44], [%rd136], 16;
add.s64 %rd137, %rd44, 4096;
add.s64 %rd138, %rd136, 524288;
cp.async.cg.shared.global [%rd137], [%rd138], 16;
shl.b64 %rd45, %rd4, 8;
add.s64 %rd139, %rd131, 32768;
shl.b64 %rd140, %rd4, 4;
shl.b64 %rd141, %rd2, 2;
xor.b64 %rd142, %rd140, %rd141;
shl.b64 %rd143, %rd4, 10;
add.s64 %rd61, %rd139, %rd142;
add.s64 %rd144, %rd61, %rd143;
shl.b64 %rd145, %rd4, 12;
add.s64 %rd146, %rd89, %rd145;
shl.b64 %rd147, %rd3, 2;
add.s64 %rd148, %rd146, %rd147;
cp.async.cg.shared.global [%rd144], [%rd148], 16;
shl.b64 %rd46, %rd105, 8;
shl.b64 %rd149, %rd105, 4;
xor.b64 %rd150, %rd149, %rd141;
shl.b64 %rd151, %rd105, 10;
add.s64 %rd62, %rd139, %rd150;
add.s64 %rd152, %rd62, %rd151;
shl.b64 %rd153, %rd105, 12;
add.s64 %rd154, %rd89, %rd153;
add.s64 %rd155, %rd154, %rd147;
cp.async.cg.shared.global [%rd152], [%rd155], 16;
shl.b64 %rd47, %rd106, 8;
shl.b64 %rd156, %rd106, 4;
xor.b64 %rd157, %rd156, %rd141;
shl.b64 %rd158, %rd106, 10;
add.s64 %rd63, %rd139, %rd157;
add.s64 %rd159, %rd63, %rd158;
shl.b64 %rd160, %rd106, 12;
add.s64 %rd161, %rd89, %rd160;
add.s64 %rd162, %rd161, %rd147;
cp.async.cg.shared.global [%rd159], [%rd162], 16;
shl.b64 %rd48, %rd107, 8;
shl.b64 %rd163, %rd107, 4;
xor.b64 %rd164, %rd163, %rd141;
shl.b64 %rd165, %rd107, 10;
add.s64 %rd64, %rd139, %rd164;
add.s64 %rd166, %rd64, %rd165;
shl.b64 %rd167, %rd107, 12;
add.s64 %rd168, %rd89, %rd167;
add.s64 %rd169, %rd168, %rd147;
cp.async.cg.shared.global [%rd166], [%rd169], 16;
cp.async.commit_group;
add.s64 %rd170, %rd44, 8192;
add.s64 %rd171, %rd136, 64;
cp.async.cg.shared.global [%rd170], [%rd171], 16;
add.s64 %rd172, %rd44, 12288;
add.s64 %rd173, %rd136, 524352;
cp.async.cg.shared.global [%rd172], [%rd173], 16;
or.b64 %rd49, %rd4, 16;
add.s64 %rd174, %rd144, 16384;
shl.b64 %rd175, %rd49, 12;
add.s64 %rd176, %rd89, %rd175;
add.s64 %rd177, %rd176, %rd147;
cp.async.cg.shared.global [%rd174], [%rd177], 16;
or.b64 %rd50, %rd4, 20;
add.s64 %rd178, %rd152, 16384;
shl.b64 %rd179, %rd50, 12;
add.s64 %rd180, %rd89, %rd179;
add.s64 %rd181, %rd180, %rd147;
cp.async.cg.shared.global [%rd178], [%rd181], 16;
or.b64 %rd51, %rd4, 24;
add.s64 %rd182, %rd159, 16384;
shl.b64 %rd183, %rd51, 12;
add.s64 %rd184, %rd89, %rd183;
add.s64 %rd185, %rd184, %rd147;
cp.async.cg.shared.global [%rd182], [%rd185], 16;
or.b64 %rd52, %rd4, 28;
add.s64 %rd186, %rd166, 16384;
shl.b64 %rd187, %rd52, 12;
add.s64 %rd188, %rd89, %rd187;
add.s64 %rd189, %rd188, %rd147;
cp.async.cg.shared.global [%rd186], [%rd189], 16;
cp.async.commit_group;
add.s64 %rd190, %rd44, 16384;
add.s64 %rd191, %rd136, 128;
cp.async.cg.shared.global [%rd190], [%rd191], 16;
add.s64 %rd192, %rd44, 20480;
add.s64 %rd193, %rd136, 524416;
cp.async.cg.shared.global [%rd192], [%rd193], 16;
or.b64 %rd53, %rd4, 32;
add.s64 %rd194, %rd144, 32768;
shl.b64 %rd195, %rd53, 12;
add.s64 %rd196, %rd89, %rd195;
add.s64 %rd197, %rd196, %rd147;
cp.async.cg.shared.global [%rd194], [%rd197], 16;
or.b64 %rd54, %rd4, 36;
add.s64 %rd198, %rd152, 32768;
shl.b64 %rd199, %rd54, 12;
add.s64 %rd200, %rd89, %rd199;
add.s64 %rd201, %rd200, %rd147;
cp.async.cg.shared.global [%rd198], [%rd201], 16;
or.b64 %rd55, %rd4, 40;
add.s64 %rd202, %rd159, 32768;
shl.b64 %rd203, %rd55, 12;
add.s64 %rd204, %rd89, %rd203;
add.s64 %rd205, %rd204, %rd147;
cp.async.cg.shared.global [%rd202], [%rd205], 16;
or.b64 %rd56, %rd4, 44;
add.s64 %rd206, %rd166, 32768;
shl.b64 %rd207, %rd56, 12;
add.s64 %rd208, %rd89, %rd207;
add.s64 %rd209, %rd208, %rd147;
cp.async.cg.shared.global [%rd206], [%rd209], 16;
cp.async.commit_group;
cp.async.wait_group 2;
bar.sync 0;
mul.wide.s32 %rd210, %r61, 256;
shl.b64 %rd211, %rd109, 4;
sub.s64 %rd57, %rd211, %rd210;
shl.b64 %rd212, %rd57, 2;
add.s64 %rd213, %rd131, %rd212;
shl.b64 %rd214, %rd7, 2;
add.s64 %rd215, %rd213, %rd214;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r139, %r140, %r141, %r142}, [%rd215];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r135, %r136, %r137, %r138}, [%rd215+1024];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r131, %r132, %r133, %r134}, [%rd215+2048];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r127, %r128, %r129, %r130}, [%rd215+3072];
shl.b64 %rd58, %rd115, 8;
shl.b64 %rd216, %rd115, 10;
add.s64 %rd217, %rd139, %rd216;
shl.b64 %rd218, %rd11, 2;
add.s64 %rd219, %rd217, %rd218;
ld.shared.f32 %f708, [%rd219];
shl.b64 %rd59, %rd117, 8;
shl.b64 %rd220, %rd117, 10;
add.s64 %rd221, %rd139, %rd220;
shl.b64 %rd222, %rd12, 2;
add.s64 %rd223, %rd221, %rd222;
ld.shared.f32 %f707, [%rd223];
shl.b64 %rd224, %rd15, 2;
add.s64 %rd225, %rd217, %rd224;
ld.shared.f32 %f706, [%rd225];
shl.b64 %rd226, %rd16, 2;
add.s64 %rd227, %rd221, %rd226;
ld.shared.f32 %f705, [%rd227];
shl.b64 %rd228, %rd19, 2;
add.s64 %rd229, %rd217, %rd228;
ld.shared.f32 %f704, [%rd229];
shl.b64 %rd230, %rd20, 2;
add.s64 %rd231, %rd221, %rd230;
ld.shared.f32 %f703, [%rd231];
shl.b64 %rd232, %rd23, 2;
add.s64 %rd233, %rd217, %rd232;
ld.shared.f32 %f702, [%rd233];
shl.b64 %rd234, %rd24, 2;
add.s64 %rd235, %rd221, %rd234;
ld.shared.f32 %f701, [%rd235];
shl.b64 %rd236, %rd27, 2;
add.s64 %rd237, %rd217, %rd236;
ld.shared.f32 %f700, [%rd237];
shl.b64 %rd238, %rd28, 2;
add.s64 %rd239, %rd221, %rd238;
ld.shared.f32 %f699, [%rd239];
shl.b64 %rd240, %rd31, 2;
add.s64 %rd241, %rd217, %rd240;
ld.shared.f32 %f698, [%rd241];
shl.b64 %rd242, %rd32, 2;
add.s64 %rd243, %rd221, %rd242;
ld.shared.f32 %f697, [%rd243];
shl.b64 %rd244, %rd35, 2;
add.s64 %rd245, %rd217, %rd244;
ld.shared.f32 %f696, [%rd245];
shl.b64 %rd246, %rd36, 2;
add.s64 %rd247, %rd221, %rd246;
ld.shared.f32 %f695, [%rd247];
shl.b64 %rd248, %rd39, 2;
add.s64 %rd249, %rd217, %rd248;
ld.shared.f32 %f694, [%rd249];
shl.b64 %rd250, %rd40, 2;
add.s64 %rd251, %rd221, %rd250;
ld.shared.f32 %f693, [%rd251];
shl.b64 %rd65, %rd119, 8;
shl.b64 %rd66, %rd121, 8;
mul.wide.u32 %rd67, %r49, 16;
mul.wide.u32 %rd252, %r51, 1048576;
mul.wide.u32 %rd253, %r50, 262144;
add.s64 %rd254, %rd252, %rd253;
mul.lo.s64 %rd255, %rd92, 8128;
add.s64 %rd256, %rd254, %rd255;
add.s64 %rd434, %rd88, %rd256;
mul.wide.u32 %rd257, %r50, 8192;
mul.lo.s64 %rd258, %rd102, 3072;
add.s64 %rd259, %rd257, %rd258;
mul.wide.u32 %rd260, %r52, 1024;
add.s64 %rd261, %rd259, %rd260;
add.s64 %rd432, %rd89, %rd261;
mov.f32 %f709, 0f00000000;
mov.u64 %rd437, 0;
mov.u64 %rd436, 1;
mov.u64 %rd435, 2;
mov.u64 %rd433, -16;
mov.u64 %rd431, 3;
shl.b64 %rd273, %rd8, 2;
shl.b64 %rd280, %rd13, 2;
shl.b64 %rd285, %rd14, 2;
shl.b64 %rd287, %rd17, 2;
shl.b64 %rd289, %rd18, 2;
shl.b64 %rd291, %rd21, 2;
shl.b64 %rd293, %rd22, 2;
shl.b64 %rd295, %rd25, 2;
shl.b64 %rd297, %rd26, 2;
shl.b64 %rd299, %rd29, 2;
shl.b64 %rd301, %rd30, 2;
shl.b64 %rd303, %rd33, 2;
shl.b64 %rd305, %rd34, 2;
shl.b64 %rd307, %rd37, 2;
shl.b64 %rd309, %rd38, 2;
shl.b64 %rd311, %rd41, 2;
shl.b64 %rd313, %rd42, 2;
mov.f32 %f710, %f709;
mov.f32 %f711, %f709;
mov.f32 %f712, %f709;
mov.f32 %f713, %f709;
mov.f32 %f714, %f709;
mov.f32 %f715, %f709;
mov.f32 %f716, %f709;
mov.f32 %f717, %f709;
mov.f32 %f718, %f709;
mov.f32 %f719, %f709;
mov.f32 %f720, %f709;
mov.f32 %f721, %f709;
mov.f32 %f722, %f709;
mov.f32 %f723, %f709;
mov.f32 %f724, %f709;
mov.f32 %f725, %f709;
mov.f32 %f726, %f709;
mov.f32 %f727, %f709;
mov.f32 %f728, %f709;
mov.f32 %f729, %f709;
mov.f32 %f730, %f709;
mov.f32 %f731, %f709;
mov.f32 %f732, %f709;
mov.f32 %f733, %f709;
mov.f32 %f734, %f709;
mov.f32 %f735, %f709;
mov.f32 %f736, %f709;
mov.f32 %f737, %f709;
mov.f32 %f738, %f709;
mov.f32 %f739, %f709;
mov.f32 %f740, %f709;
mov.f32 %f741, %f709;
mov.f32 %f742, %f709;
mov.f32 %f743, %f709;
mov.f32 %f744, %f709;
mov.f32 %f745, %f709;
mov.f32 %f746, %f709;
mov.f32 %f747, %f709;
mov.f32 %f748, %f709;
mov.f32 %f749, %f709;
mov.f32 %f750, %f709;
mov.f32 %f751, %f709;
mov.f32 %f752, %f709;
mov.f32 %f753, %f709;
mov.f32 %f754, %f709;
mov.f32 %f755, %f709;
mov.f32 %f756, %f709;
mov.f32 %f757, %f709;
mov.f32 %f758, %f709;
mov.f32 %f759, %f709;
mov.f32 %f760, %f709;
mov.f32 %f761, %f709;
mov.f32 %f762, %f709;
mov.f32 %f763, %f709;
mov.f32 %f764, %f709;
mov.f32 %f765, %f709;
mov.f32 %f766, %f709;
mov.f32 %f767, %f709;
mov.f32 %f768, %f709;
mov.f32 %f769, %f709;
mov.f32 %f770, %f709;
mov.f32 %f771, %f709;
mov.f32 %f772, %f709;
mov.f32 %f773, %f709;
mov.f32 %f774, %f709;
mov.f32 %f775, %f709;
mov.f32 %f776, %f709;
mov.f32 %f777, %f709;
mov.f32 %f778, %f709;
mov.f32 %f779, %f709;
mov.f32 %f780, %f709;
mov.f32 %f781, %f709;
mov.f32 %f782, %f709;
mov.f32 %f783, %f709;
mov.f32 %f784, %f709;
mov.f32 %f785, %f709;
mov.f32 %f786, %f709;
mov.f32 %f787, %f709;
mov.f32 %f788, %f709;
mov.f32 %f789, %f709;
mov.f32 %f790, %f709;
mov.f32 %f791, %f709;
mov.f32 %f792, %f709;
mov.f32 %f793, %f709;
mov.f32 %f794, %f709;
mov.f32 %f795, %f709;
mov.f32 %f796, %f709;
mov.f32 %f797, %f709;
mov.f32 %f798, %f709;
mov.f32 %f799, %f709;
mov.f32 %f800, %f709;
mov.f32 %f801, %f709;
mov.f32 %f802, %f709;
mov.f32 %f803, %f709;
mov.f32 %f804, %f709;
mov.f32 %f805, %f709;
mov.f32 %f806, %f709;
mov.f32 %f807, %f709;
mov.f32 %f808, %f709;
mov.f32 %f809, %f709;
mov.f32 %f810, %f709;
mov.f32 %f811, %f709;
mov.f32 %f812, %f709;
mov.f32 %f813, %f709;
mov.f32 %f814, %f709;
mov.f32 %f815, %f709;
mov.f32 %f816, %f709;
mov.f32 %f817, %f709;
mov.f32 %f818, %f709;
mov.f32 %f819, %f709;
mov.f32 %f820, %f709;
mov.f32 %f821, %f709;
mov.f32 %f822, %f709;
mov.f32 %f823, %f709;
mov.f32 %f824, %f709;
mov.f32 %f825, %f709;
mov.f32 %f826, %f709;
mov.f32 %f827, %f709;
mov.f32 %f828, %f709;
mov.f32 %f829, %f709;
mov.f32 %f830, %f709;
mov.f32 %f831, %f709;
mov.f32 %f832, %f709;
mov.f32 %f833, %f709;
mov.f32 %f834, %f709;
mov.f32 %f835, %f709;
mov.f32 %f836, %f709;
$L__BB0_1:
mov.u64 %rd75, %rd436;
mov.u64 %rd436, %rd435;
add.s64 %rd433, %rd433, 16;
setp.lt.u64 %p1, %rd433, 2000;
shl.b64 %rd268, %rd437, 13;
add.s64 %rd270, %rd131, %rd268;
add.s64 %rd272, %rd270, %rd212;
add.s64 %rd274, %rd272, %rd273;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r78, %r79, %r80, %r81}, [%rd274];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r82, %r83, %r84, %r85}, [%rd274+1024];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r86, %r87, %r88, %r89}, [%rd274+2048];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r90, %r91, %r92, %r93}, [%rd274+3072];
shl.b64 %rd275, %rd437, 12;
or.b64 %rd276, %rd275, %rd65;
shl.b64 %rd277, %rd276, 2;
add.s64 %rd279, %rd139, %rd277;
add.s64 %rd281, %rd279, %rd280;
ld.shared.u32 %r94, [%rd281];
or.b64 %rd282, %rd275, %rd66;
shl.b64 %rd283, %rd282, 2;
add.s64 %rd284, %rd139, %rd283;
add.s64 %rd286, %rd284, %rd285;
ld.shared.u32 %r95, [%rd286];
add.s64 %rd288, %rd279, %rd287;
ld.shared.u32 %r96, [%rd288];
add.s64 %rd290, %rd284, %rd289;
ld.shared.u32 %r97, [%rd290];
add.s64 %rd292, %rd279, %rd291;
ld.shared.u32 %r98, [%rd292];
add.s64 %rd294, %rd284, %rd293;
ld.shared.u32 %r99, [%rd294];
add.s64 %rd296, %rd279, %rd295;
ld.shared.u32 %r100, [%rd296];
add.s64 %rd298, %rd284, %rd297;
ld.shared.u32 %r101, [%rd298];
add.s64 %rd300, %rd279, %rd299;
ld.shared.u32 %r102, [%rd300];
add.s64 %rd302, %rd284, %rd301;
ld.shared.u32 %r103, [%rd302];
add.s64 %rd304, %rd279, %rd303;
ld.shared.u32 %r104, [%rd304];
add.s64 %rd306, %rd284, %rd305;
ld.shared.u32 %r105, [%rd306];
add.s64 %rd308, %rd279, %rd307;
ld.shared.u32 %r106, [%rd308];
add.s64 %rd310, %rd284, %rd309;
ld.shared.u32 %r107, [%rd310];
add.s64 %rd312, %rd279, %rd311;
ld.shared.u32 %r108, [%rd312];
add.s64 %rd314, %rd284, %rd313;
ld.shared.u32 %r109, [%rd314];
mov.b32 %r110, %f708;
mov.b32 %r111, %f707;
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f437, %f438, %f439, %f440},
{%r139, %r140, %r141, %r142},
{%r110, %r111},
{%f833, %f834, %f835, %f836};
mov.b32 %r112, %f706;
mov.b32 %r113, %f705;
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f441, %f442, %f443, %f444},
{%r139, %r140, %r141, %r142},
{%r112, %r113},
{%f829, %f830, %f831, %f832};
mov.b32 %r114, %f704;
mov.b32 %r115, %f703;
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f445, %f446, %f447, %f448},
{%r139, %r140, %r141, %r142},
{%r114, %r115},
{%f825, %f826, %f827, %f828};
mov.b32 %r116, %f702;
mov.b32 %r117, %f701;
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f449, %f450, %f451, %f452},
{%r139, %r140, %r141, %r142},
{%r116, %r117},
{%f821, %f822, %f823, %f824};
mov.b32 %r118, %f700;
mov.b32 %r119, %f699;
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f453, %f454, %f455, %f456},
{%r139, %r140, %r141, %r142},
{%r118, %r119},
{%f817, %f818, %f819, %f820};
mov.b32 %r120, %f698;
mov.b32 %r121, %f697;
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f457, %f458, %f459, %f460},
{%r139, %r140, %r141, %r142},
{%r120, %r121},
{%f813, %f814, %f815, %f816};
mov.b32 %r122, %f696;
mov.b32 %r123, %f695;
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f461, %f462, %f463, %f464},
{%r139, %r140, %r141, %r142},
{%r122, %r123},
{%f809, %f810, %f811, %f812};
mov.b32 %r124, %f694;
mov.b32 %r125, %f693;
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f465, %f466, %f467, %f468},
{%r139, %r140, %r141, %r142},
{%r124, %r125},
{%f805, %f806, %f807, %f808};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f469, %f470, %f471, %f472},
{%r135, %r136, %r137, %r138},
{%r110, %r111},
{%f801, %f802, %f803, %f804};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f473, %f474, %f475, %f476},
{%r135, %r136, %r137, %r138},
{%r112, %r113},
{%f797, %f798, %f799, %f800};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f477, %f478, %f479, %f480},
{%r135, %r136, %r137, %r138},
{%r114, %r115},
{%f793, %f794, %f795, %f796};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f481, %f482, %f483, %f484},
{%r135, %r136, %r137, %r138},
{%r116, %r117},
{%f789, %f790, %f791, %f792};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f485, %f486, %f487, %f488},
{%r135, %r136, %r137, %r138},
{%r118, %r119},
{%f785, %f786, %f787, %f788};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f489, %f490, %f491, %f492},
{%r135, %r136, %r137, %r138},
{%r120, %r121},
{%f781, %f782, %f783, %f784};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f493, %f494, %f495, %f496},
{%r135, %r136, %r137, %r138},
{%r122, %r123},
{%f777, %f778, %f779, %f780};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f497, %f498, %f499, %f500},
{%r135, %r136, %r137, %r138},
{%r124, %r125},
{%f773, %f774, %f775, %f776};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f501, %f502, %f503, %f504},
{%r131, %r132, %r133, %r134},
{%r110, %r111},
{%f769, %f770, %f771, %f772};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f505, %f506, %f507, %f508},
{%r131, %r132, %r133, %r134},
{%r112, %r113},
{%f765, %f766, %f767, %f768};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f509, %f510, %f511, %f512},
{%r131, %r132, %r133, %r134},
{%r114, %r115},
{%f761, %f762, %f763, %f764};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f513, %f514, %f515, %f516},
{%r131, %r132, %r133, %r134},
{%r116, %r117},
{%f757, %f758, %f759, %f760};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f517, %f518, %f519, %f520},
{%r131, %r132, %r133, %r134},
{%r118, %r119},
{%f753, %f754, %f755, %f756};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f521, %f522, %f523, %f524},
{%r131, %r132, %r133, %r134},
{%r120, %r121},
{%f749, %f750, %f751, %f752};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f525, %f526, %f527, %f528},
{%r131, %r132, %r133, %r134},
{%r122, %r123},
{%f745, %f746, %f747, %f748};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f529, %f530, %f531, %f532},
{%r131, %r132, %r133, %r134},
{%r124, %r125},
{%f741, %f742, %f743, %f744};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f533, %f534, %f535, %f536},
{%r127, %r128, %r129, %r130},
{%r110, %r111},
{%f737, %f738, %f739, %f740};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f537, %f538, %f539, %f540},
{%r127, %r128, %r129, %r130},
{%r112, %r113},
{%f733, %f734, %f735, %f736};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f541, %f542, %f543, %f544},
{%r127, %r128, %r129, %r130},
{%r114, %r115},
{%f729, %f730, %f731, %f732};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f545, %f546, %f547, %f548},
{%r127, %r128, %r129, %r130},
{%r116, %r117},
{%f725, %f726, %f727, %f728};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f549, %f550, %f551, %f552},
{%r127, %r128, %r129, %r130},
{%r118, %r119},
{%f721, %f722, %f723, %f724};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f553, %f554, %f555, %f556},
{%r127, %r128, %r129, %r130},
{%r120, %r121},
{%f717, %f718, %f719, %f720};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f557, %f558, %f559, %f560},
{%r127, %r128, %r129, %r130},
{%r122, %r123},
{%f713, %f714, %f715, %f716};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f561, %f562, %f563, %f564},
{%r127, %r128, %r129, %r130},
{%r124, %r125},
{%f709, %f710, %f711, %f712};
and.b64 %rd435, %rd431, 3;
shl.b64 %rd315, %rd435, 11;
or.b64 %rd316, %rd315, %rd43;
shl.b64 %rd317, %rd316, 2;
add.s64 %rd318, %rd60, %rd317;
add.s64 %rd319, %rd434, %rd67;
add.s64 %rd262, %rd319, 192;
selp.b32 %r67, 16, 0, %p1;
cvt.u32.u64 %r66, %rd318;
// begin inline asm
cp.async.cg.shared.global [%r66], [%rd262], 16, %r67;
// end inline asm
shl.b64 %rd320, %rd435, 13;
add.s64 %rd321, %rd44, %rd320;
add.s64 %rd263, %rd319, 524480;
cvt.u32.u64 %r126, %rd321;
add.s32 %r68, %r126, 4096;
// begin inline asm
cp.async.cg.shared.global [%r68], [%rd263], 16, %r67;
// end inline asm
shl.b64 %rd322, %rd435, 12;
or.b64 %rd323, %rd322, %rd45;
shl.b64 %rd324, %rd323, 2;
add.s64 %rd325, %rd61, %rd324;
add.s64 %rd326, %rd432, %rd67;
add.s64 %rd264, %rd326, 196608;
cvt.u32.u64 %r70, %rd325;
// begin inline asm
cp.async.cg.shared.global [%r70], [%rd264], 16, %r67;
// end inline asm
or.b64 %rd327, %rd322, %rd46;
shl.b64 %rd328, %rd327, 2;
add.s64 %rd329, %rd62, %rd328;
add.s64 %rd265, %rd326, 212992;
cvt.u32.u64 %r72, %rd329;
// begin inline asm
cp.async.cg.shared.global [%r72], [%rd265], 16, %r67;
// end inline asm
or.b64 %rd330, %rd322, %rd47;
shl.b64 %rd331, %rd330, 2;
add.s64 %rd332, %rd63, %rd331;
add.s64 %rd266, %rd326, 229376;
cvt.u32.u64 %r74, %rd332;
// begin inline asm
cp.async.cg.shared.global [%r74], [%rd266], 16, %r67;
// end inline asm
or.b64 %rd333, %rd322, %rd48;
shl.b64 %rd334, %rd333, 2;
add.s64 %rd335, %rd64, %rd334;
add.s64 %rd267, %rd326, 245760;
cvt.u32.u64 %r76, %rd335;
// begin inline asm
cp.async.cg.shared.global [%r76], [%rd267], 16, %r67;
// end inline asm
cp.async.commit_group;
cp.async.wait_group 2;
bar.sync 0;
shl.b64 %rd336, %rd75, 13;
add.s64 %rd337, %rd131, %rd336;
add.s64 %rd338, %rd337, %rd212;
add.s64 %rd340, %rd338, %rd214;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r139, %r140, %r141, %r142}, [%rd340];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r135, %r136, %r137, %r138}, [%rd340+1024];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r131, %r132, %r133, %r134}, [%rd340+2048];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r127, %r128, %r129, %r130}, [%rd340+3072];
shl.b64 %rd341, %rd75, 12;
or.b64 %rd342, %rd341, %rd58;
shl.b64 %rd343, %rd342, 2;
add.s64 %rd344, %rd139, %rd343;
add.s64 %rd346, %rd344, %rd218;
ld.shared.f32 %f708, [%rd346];
or.b64 %rd347, %rd341, %rd59;
shl.b64 %rd348, %rd347, 2;
add.s64 %rd349, %rd139, %rd348;
add.s64 %rd351, %rd349, %rd222;
ld.shared.f32 %f707, [%rd351];
add.s64 %rd353, %rd344, %rd224;
ld.shared.f32 %f706, [%rd353];
add.s64 %rd355, %rd349, %rd226;
ld.shared.f32 %f705, [%rd355];
add.s64 %rd357, %rd344, %rd228;
ld.shared.f32 %f704, [%rd357];
add.s64 %rd359, %rd349, %rd230;
ld.shared.f32 %f703, [%rd359];
add.s64 %rd361, %rd344, %rd232;
ld.shared.f32 %f702, [%rd361];
add.s64 %rd363, %rd349, %rd234;
ld.shared.f32 %f701, [%rd363];
add.s64 %rd365, %rd344, %rd236;
ld.shared.f32 %f700, [%rd365];
add.s64 %rd367, %rd349, %rd238;
ld.shared.f32 %f699, [%rd367];
add.s64 %rd369, %rd344, %rd240;
ld.shared.f32 %f698, [%rd369];
add.s64 %rd371, %rd349, %rd242;
ld.shared.f32 %f697, [%rd371];
add.s64 %rd373, %rd344, %rd244;
ld.shared.f32 %f696, [%rd373];
add.s64 %rd375, %rd349, %rd246;
ld.shared.f32 %f695, [%rd375];
add.s64 %rd377, %rd344, %rd248;
ld.shared.f32 %f694, [%rd377];
add.s64 %rd379, %rd349, %rd250;
ld.shared.f32 %f693, [%rd379];
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f833, %f834, %f835, %f836},
{%r78, %r79, %r80, %r81},
{%r94, %r95},
{%f437, %f438, %f439, %f440};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f829, %f830, %f831, %f832},
{%r78, %r79, %r80, %r81},
{%r96, %r97},
{%f441, %f442, %f443, %f444};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f825, %f826, %f827, %f828},
{%r78, %r79, %r80, %r81},
{%r98, %r99},
{%f445, %f446, %f447, %f448};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f821, %f822, %f823, %f824},
{%r78, %r79, %r80, %r81},
{%r100, %r101},
{%f449, %f450, %f451, %f452};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f817, %f818, %f819, %f820},
{%r78, %r79, %r80, %r81},
{%r102, %r103},
{%f453, %f454, %f455, %f456};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f813, %f814, %f815, %f816},
{%r78, %r79, %r80, %r81},
{%r104, %r105},
{%f457, %f458, %f459, %f460};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f809, %f810, %f811, %f812},
{%r78, %r79, %r80, %r81},
{%r106, %r107},
{%f461, %f462, %f463, %f464};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f805, %f806, %f807, %f808},
{%r78, %r79, %r80, %r81},
{%r108, %r109},
{%f465, %f466, %f467, %f468};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f801, %f802, %f803, %f804},
{%r82, %r83, %r84, %r85},
{%r94, %r95},
{%f469, %f470, %f471, %f472};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f797, %f798, %f799, %f800},
{%r82, %r83, %r84, %r85},
{%r96, %r97},
{%f473, %f474, %f475, %f476};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f793, %f794, %f795, %f796},
{%r82, %r83, %r84, %r85},
{%r98, %r99},
{%f477, %f478, %f479, %f480};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f789, %f790, %f791, %f792},
{%r82, %r83, %r84, %r85},
{%r100, %r101},
{%f481, %f482, %f483, %f484};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f785, %f786, %f787, %f788},
{%r82, %r83, %r84, %r85},
{%r102, %r103},
{%f485, %f486, %f487, %f488};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f781, %f782, %f783, %f784},
{%r82, %r83, %r84, %r85},
{%r104, %r105},
{%f489, %f490, %f491, %f492};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f777, %f778, %f779, %f780},
{%r82, %r83, %r84, %r85},
{%r106, %r107},
{%f493, %f494, %f495, %f496};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f773, %f774, %f775, %f776},
{%r82, %r83, %r84, %r85},
{%r108, %r109},
{%f497, %f498, %f499, %f500};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f769, %f770, %f771, %f772},
{%r86, %r87, %r88, %r89},
{%r94, %r95},
{%f501, %f502, %f503, %f504};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f765, %f766, %f767, %f768},
{%r86, %r87, %r88, %r89},
{%r96, %r97},
{%f505, %f506, %f507, %f508};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f761, %f762, %f763, %f764},
{%r86, %r87, %r88, %r89},
{%r98, %r99},
{%f509, %f510, %f511, %f512};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f757, %f758, %f759, %f760},
{%r86, %r87, %r88, %r89},
{%r100, %r101},
{%f513, %f514, %f515, %f516};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f753, %f754, %f755, %f756},
{%r86, %r87, %r88, %r89},
{%r102, %r103},
{%f517, %f518, %f519, %f520};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f749, %f750, %f751, %f752},
{%r86, %r87, %r88, %r89},
{%r104, %r105},
{%f521, %f522, %f523, %f524};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f745, %f746, %f747, %f748},
{%r86, %r87, %r88, %r89},
{%r106, %r107},
{%f525, %f526, %f527, %f528};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f741, %f742, %f743, %f744},
{%r86, %r87, %r88, %r89},
{%r108, %r109},
{%f529, %f530, %f531, %f532};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f737, %f738, %f739, %f740},
{%r90, %r91, %r92, %r93},
{%r94, %r95},
{%f533, %f534, %f535, %f536};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f733, %f734, %f735, %f736},
{%r90, %r91, %r92, %r93},
{%r96, %r97},
{%f537, %f538, %f539, %f540};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f729, %f730, %f731, %f732},
{%r90, %r91, %r92, %r93},
{%r98, %r99},
{%f541, %f542, %f543, %f544};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f725, %f726, %f727, %f728},
{%r90, %r91, %r92, %r93},
{%r100, %r101},
{%f545, %f546, %f547, %f548};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f721, %f722, %f723, %f724},
{%r90, %r91, %r92, %r93},
{%r102, %r103},
{%f549, %f550, %f551, %f552};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f717, %f718, %f719, %f720},
{%r90, %r91, %r92, %r93},
{%r104, %r105},
{%f553, %f554, %f555, %f556};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f713, %f714, %f715, %f716},
{%r90, %r91, %r92, %r93},
{%r106, %r107},
{%f557, %f558, %f559, %f560};
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
{%f709, %f710, %f711, %f712},
{%r90, %r91, %r92, %r93},
{%r108, %r109},
{%f561, %f562, %f563, %f564};
add.s64 %rd434, %rd434, 64;
add.s64 %rd432, %rd432, 65536;
add.s64 %rd431, %rd431, 1;
setp.lt.u64 %p2, %rd433, 2032;
mov.u64 %rd437, %rd75;
@%p2 bra $L__BB0_1;
add.s64 %rd380, %rd5, %rd9;
shl.b64 %rd381, %rd9, 3;
sub.s64 %rd382, %rd6, %rd381;
add.s64 %rd383, %rd382, %rd10;
cp.async.commit_group;
cp.async.wait_group 0;
bar.sync 0;
shl.b64 %rd384, %rd380, 10;
add.s64 %rd386, %rd131, %rd384;
shl.b64 %rd387, %rd383, 2;
add.s64 %rd388, %rd386, %rd387;
st.shared.v2.f32 [%rd388+49376], {%f709, %f710};
st.shared.v2.f32 [%rd388+57568], {%f711, %f712};
st.shared.v2.f32 [%rd388+49344], {%f713, %f714};
st.shared.v2.f32 [%rd388+57536], {%f715, %f716};
st.shared.v2.f32 [%rd388+49312], {%f717, %f718};
st.shared.v2.f32 [%rd388+57504], {%f719, %f720};
st.shared.v2.f32 [%rd388+49280], {%f721, %f722};
st.shared.v2.f32 [%rd388+57472], {%f723, %f724};
st.shared.v2.f32 [%rd388+49248], {%f725, %f726};
st.shared.v2.f32 [%rd388+57440], {%f727, %f728};
st.shared.v2.f32 [%rd388+49216], {%f729, %f730};
st.shared.v2.f32 [%rd388+57408], {%f731, %f732};
st.shared.v2.f32 [%rd388+49184], {%f733, %f734};
st.shared.v2.f32 [%rd388+57376], {%f735, %f736};
st.shared.v2.f32 [%rd388+49152], {%f737, %f738};
st.shared.v2.f32 [%rd388+57344], {%f739, %f740};
st.shared.v2.f32 [%rd388+32992], {%f741, %f742};
st.shared.v2.f32 [%rd388+41184], {%f743, %f744};
st.shared.v2.f32 [%rd388+32960], {%f745, %f746};
st.shared.v2.f32 [%rd388+41152], {%f747, %f748};
st.shared.v2.f32 [%rd388+32928], {%f749, %f750};
st.shared.v2.f32 [%rd388+41120], {%f751, %f752};
st.shared.v2.f32 [%rd388+32896], {%f753, %f754};
st.shared.v2.f32 [%rd388+41088], {%f755, %f756};
st.shared.v2.f32 [%rd388+32864], {%f757, %f758};
st.shared.v2.f32 [%rd388+41056], {%f759, %f760};
st.shared.v2.f32 [%rd388+32832], {%f761, %f762};
st.shared.v2.f32 [%rd388+41024], {%f763, %f764};
st.shared.v2.f32 [%rd388+32800], {%f765, %f766};
st.shared.v2.f32 [%rd388+40992], {%f767, %f768};
st.shared.v2.f32 [%rd388+32768], {%f769, %f770};
st.shared.v2.f32 [%rd388+40960], {%f771, %f772};
st.shared.v2.f32 [%rd388+16608], {%f773, %f774};
st.shared.v2.f32 [%rd388+24800], {%f775, %f776};
st.shared.v2.f32 [%rd388+16576], {%f777, %f778};
st.shared.v2.f32 [%rd388+24768], {%f779, %f780};
st.shared.v2.f32 [%rd388+16544], {%f781, %f782};
st.shared.v2.f32 [%rd388+24736], {%f783, %f784};
st.shared.v2.f32 [%rd388+16512], {%f785, %f786};
st.shared.v2.f32 [%rd388+24704], {%f787, %f788};
st.shared.v2.f32 [%rd388+16480], {%f789, %f790};
st.shared.v2.f32 [%rd388+24672], {%f791, %f792};
st.shared.v2.f32 [%rd388+16448], {%f793, %f794};
st.shared.v2.f32 [%rd388+24640], {%f795, %f796};
st.shared.v2.f32 [%rd388+16416], {%f797, %f798};
st.shared.v2.f32 [%rd388+24608], {%f799, %f800};
st.shared.v2.f32 [%rd388+16384], {%f801, %f802};
st.shared.v2.f32 [%rd388+24576], {%f803, %f804};
st.shared.v2.f32 [%rd388+224], {%f805, %f806};
st.shared.v2.f32 [%rd388+8416], {%f807, %f808};
st.shared.v2.f32 [%rd388+192], {%f809, %f810};
st.shared.v2.f32 [%rd388+8384], {%f811, %f812};
st.shared.v2.f32 [%rd388+160], {%f813, %f814};
st.shared.v2.f32 [%rd388+8352], {%f815, %f816};
st.shared.v2.f32 [%rd388+128], {%f817, %f818};
st.shared.v2.f32 [%rd388+8320], {%f819, %f820};
st.shared.v2.f32 [%rd388+96], {%f821, %f822};
st.shared.v2.f32 [%rd388+8288], {%f823, %f824};
st.shared.v2.f32 [%rd388+64], {%f825, %f826};
st.shared.v2.f32 [%rd388+8256], {%f827, %f828};
st.shared.v2.f32 [%rd388+32], {%f829, %f830};
st.shared.v2.f32 [%rd388+8224], {%f831, %f832};
st.shared.v2.f32 [%rd388], {%f833, %f834};
st.shared.v2.f32 [%rd388+8192], {%f835, %f836};
bar.sync 0;
shl.b64 %rd389, %rd45, 2;
add.s64 %rd390, %rd131, %rd389;
add.s64 %rd392, %rd390, %rd141;
ld.shared.v4.f32 {%f565, %f566, %f567, %f568}, [%rd392];
or.b64 %rd393, %rd4, %rd1;
shl.b64 %rd394, %rd393, 12;
add.s64 %rd395, %rd82, %rd394;
add.s64 %rd397, %rd395, %rd147;
st.global.v4.f32 [%rd397], {%f565, %f566, %f567, %f568};
shl.b64 %rd398, %rd46, 2;
add.s64 %rd399, %rd131, %rd398;
add.s64 %rd400, %rd399, %rd141;
ld.shared.v4.f32 {%f569, %f570, %f571, %f572}, [%rd400];
st.global.v4.f32 [%rd397+16384], {%f569, %f570, %f571, %f572};
shl.b64 %rd401, %rd47, 2;
add.s64 %rd402, %rd131, %rd401;
add.s64 %rd403, %rd402, %rd141;
ld.shared.v4.f32 {%f573, %f574, %f575, %f576}, [%rd403];
st.global.v4.f32 [%rd397+32768], {%f573, %f574, %f575, %f576};
shl.b64 %rd404, %rd48, 2;
add.s64 %rd405, %rd131, %rd404;
add.s64 %rd406, %rd405, %rd141;
ld.shared.v4.f32 {%f577, %f578, %f579, %f580}, [%rd406];
st.global.v4.f32 [%rd397+49152], {%f577, %f578, %f579, %f580};
shl.b64 %rd407, %rd49, 10;
add.s64 %rd408, %rd131, %rd407;
add.s64 %rd409, %rd408, %rd141;
ld.shared.v4.f32 {%f581, %f582, %f583, %f584}, [%rd409];
st.global.v4.f32 [%rd397+65536], {%f581, %f582, %f583, %f584};
shl.b64 %rd410, %rd50, 10;
add.s64 %rd411, %rd131, %rd410;
add.s64 %rd412, %rd411, %rd141;
ld.shared.v4.f32 {%f585, %f586, %f587, %f588}, [%rd412];
st.global.v4.f32 [%rd397+81920], {%f585, %f586, %f587, %f588};
shl.b64 %rd413, %rd51, 10;
add.s64 %rd414, %rd131, %rd413;
add.s64 %rd415, %rd414, %rd141;
ld.shared.v4.f32 {%f589, %f590, %f591, %f592}, [%rd415];
st.global.v4.f32 [%rd397+98304], {%f589, %f590, %f591, %f592};
shl.b64 %rd416, %rd52, 10;
add.s64 %rd417, %rd131, %rd416;
add.s64 %rd418, %rd417, %rd141;
ld.shared.v4.f32 {%f593, %f594, %f595, %f596}, [%rd418];
st.global.v4.f32 [%rd397+114688], {%f593, %f594, %f595, %f596};
shl.b64 %rd419, %rd53, 10;
add.s64 %rd420, %rd131, %rd419;
add.s64 %rd421, %rd420, %rd141;
ld.shared.v4.f32 {%f597, %f598, %f599, %f600}, [%rd421];
st.global.v4.f32 [%rd397+131072], {%f597, %f598, %f599, %f600};
shl.b64 %rd422, %rd54, 10;
add.s64 %rd423, %rd131, %rd422;
add.s64 %rd424, %rd423, %rd141;
ld.shared.v4.f32 {%f601, %f602, %f603, %f604}, [%rd424];
st.global.v4.f32 [%rd397+147456], {%f601, %f602, %f603, %f604};
shl.b64 %rd425, %rd55, 10;
add.s64 %rd426, %rd131, %rd425;
add.s64 %rd427, %rd426, %rd141;
ld.shared.v4.f32 {%f605, %f606, %f607, %f608}, [%rd427];
st.global.v4.f32 [%rd397+163840], {%f605, %f606, %f607, %f608};
shl.b64 %rd428, %rd56, 10;
add.s64 %rd429, %rd131, %rd428;
add.s64 %rd430, %rd429, %rd141;
ld.shared.v4.f32 {%f609, %f610, %f611, %f612}, [%rd430];
st.global.v4.f32 [%rd397+180224], {%f609, %f610, %f611, %f612};
ld.shared.v4.f32 {%f613, %f614, %f615, %f616}, [%rd392+49152];
st.global.v4.f32 [%rd397+196608], {%f613, %f614, %f615, %f616};
ld.shared.v4.f32 {%f617, %f618, %f619, %f620}, [%rd392+53248];
st.global.v4.f32 [%rd397+212992], {%f617, %f618, %f619, %f620};
ld.shared.v4.f32 {%f621, %f622, %f623, %f624}, [%rd392+57344];
st.global.v4.f32 [%rd397+229376], {%f621, %f622, %f623, %f624};
ld.shared.v4.f32 {%f625, %f626, %f627, %f628}, [%rd392+61440];
st.global.v4.f32 [%rd397+245760], {%f625, %f626, %f627, %f628};
ld.shared.v4.f32 {%f629, %f630, %f631, %f632}, [%rd392+65536];
st.global.v4.f32 [%rd397+262144], {%f629, %f630, %f631, %f632};
ld.shared.v4.f32 {%f633, %f634, %f635, %f636}, [%rd392+69632];
st.global.v4.f32 [%rd397+278528], {%f633, %f634, %f635, %f636};
ld.shared.v4.f32 {%f637, %f638, %f639, %f640}, [%rd392+73728];
st.global.v4.f32 [%rd397+294912], {%f637, %f638, %f639, %f640};
ld.shared.v4.f32 {%f641, %f642, %f643, %f644}, [%rd392+77824];
st.global.v4.f32 [%rd397+311296], {%f641, %f642, %f643, %f644};
ld.shared.v4.f32 {%f645, %f646, %f647, %f648}, [%rd392+81920];
st.global.v4.f32 [%rd397+327680], {%f645, %f646, %f647, %f648};
ld.shared.v4.f32 {%f649, %f650, %f651, %f652}, [%rd392+86016];
st.global.v4.f32 [%rd397+344064], {%f649, %f650, %f651, %f652};
ld.shared.v4.f32 {%f653, %f654, %f655, %f656}, [%rd392+90112];
st.global.v4.f32 [%rd397+360448], {%f653, %f654, %f655, %f656};
ld.shared.v4.f32 {%f657, %f658, %f659, %f660}, [%rd392+94208];
st.global.v4.f32 [%rd397+376832], {%f657, %f658, %f659, %f660};
ld.shared.v4.f32 {%f661, %f662, %f663, %f664}, [%rd392+98304];
st.global.v4.f32 [%rd397+393216], {%f661, %f662, %f663, %f664};
ld.shared.v4.f32 {%f665, %f666, %f667, %f668}, [%rd392+102400];
st.global.v4.f32 [%rd397+409600], {%f665, %f666, %f667, %f668};
ld.shared.v4.f32 {%f669, %f670, %f671, %f672}, [%rd392+106496];
st.global.v4.f32 [%rd397+425984], {%f669, %f670, %f671, %f672};
ld.shared.v4.f32 {%f673, %f674, %f675, %f676}, [%rd392+110592];
st.global.v4.f32 [%rd397+442368], {%f673, %f674, %f675, %f676};
ld.shared.v4.f32 {%f677, %f678, %f679, %f680}, [%rd392+114688];
st.global.v4.f32 [%rd397+458752], {%f677, %f678, %f679, %f680};
ld.shared.v4.f32 {%f681, %f682, %f683, %f684}, [%rd392+118784];
st.global.v4.f32 [%rd397+475136], {%f681, %f682, %f683, %f684};
ld.shared.v4.f32 {%f685, %f686, %f687, %f688}, [%rd392+122880];
st.global.v4.f32 [%rd397+491520], {%f685, %f686, %f687, %f688};
ld.shared.v4.f32 {%f689, %f690, %f691, %f692}, [%rd392+126976];
st.global.v4.f32 [%rd397+507904], {%f689, %f690, %f691, %f692};
bar.sync 0;
ret;
}
// HAL executable holding one dispatch: C = A * B in f32, where
// A is 3456x2048, B is 2048x1024 and C is 3456x1024.
hal.executable public @matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0 {
  // Single variant, targeting the "cuda-nvptx-fb" format with target_arch = "sm_80".
  hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>) {
    // Entry point with ordinal 0. Its pipeline layout declares no push
    // constants and one descriptor set (0) with three storage buffers:
    // bindings 0 and 1 are flagged ReadOnly, binding 2 carries no flag.
    hal.executable.export public @matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
    // Workgroup-count region: receives the device and returns the three
    // counts produced by flow.dispatch.workgroup_count_from_slice.
    ^bb0(%device: !hal.device):
      %wg_x, %wg_y, %wg_z = flow.dispatch.workgroup_count_from_slice
      hal.return %wg_x, %wg_y, %wg_z : index, index, index
    }
    builtin.module {
      // Dispatch body, expressed on whole tensors.
      func.func @matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32() {
        // Constants: the f32 fill value and the byte offset shared by all bindings.
        %zero = arith.constant 0.000000e+00 : f32
        %no_offset = arith.constant 0 : index

        // Bind the three buffers of set 0: the two matmul inputs (read-only)
        // and the result (write-only), each at offset 0 with alignment 64.
        %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%no_offset) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<3456x2048xf32>>
        %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%no_offset) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %out_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%no_offset) : !flow.dispatch.tensor<writeonly:tensor<3456x1024xf32>>

        // Load both inputs in full (zero offsets, unit strides).
        %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [3456, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3456x2048xf32>> -> tensor<3456x2048xf32>
        %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>

        // Zero-filled 3456x1024 tensor used as the matmul's output operand.
        %acc_empty = tensor.empty() : tensor<3456x1024xf32>
        %acc_zeroed = linalg.fill ins(%zero : f32) outs(%acc_empty : tensor<3456x1024xf32>) -> tensor<3456x1024xf32>

        // The matmul itself, followed by a full-tensor store to binding 2.
        %product = linalg.matmul ins(%lhs, %rhs : tensor<3456x2048xf32>, tensor<2048x1024xf32>) outs(%acc_zeroed : tensor<3456x1024xf32>) -> tensor<3456x1024xf32>
        flow.dispatch.tensor.store %product, %out_binding, offsets = [0, 0], sizes = [3456, 1024], strides = [1, 1] : tensor<3456x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<3456x1024xf32>>
        return
      }
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment