Last active
February 4, 2024 19:33
-
-
Save antiagainst/5157cafb6f54c763ea216d84ba56b9d0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// Generated by LLVM NVPTX Back-End | |
// | |
// NOTE(review): every line of this listing ends with a stray ` | |`. That looks
// like table-cell residue from a web export rather than PTX syntax; presumably
// it has to be stripped before the text is handed to an assembler -- confirm
// against the original file.
//
// Module preamble: the PTX ISA version, the target architecture and the address
// width that the rest of the module is written against.
.version 7.6 | |
.target sm_80 | |
.address_size 64 | |
// .globl matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32 | |
// Byte array in the .shared state space, 16-byte aligned, declared .extern and
// without a size ([]). The entry below takes its address with `mov.u64` and
// adds byte offsets to it to form the shared-memory operands of its cp.async,
// ldmatrix, ld.shared and st.shared instructions.
.extern .shared .align 16 .b8 __dynamic_shared_memory__[]; | |
.visible .entry matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32( | |
.param .u64 matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_0, | |
.param .u64 matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_1, | |
.param .u64 matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_2 | |
) | |
.maxntid 128, 2, 1 | |
{ | |
.reg .pred %p<3>; | |
.reg .b32 %r<143>; | |
.reg .f32 %f<837>; | |
.reg .b64 %rd<542>; | |
ld.param.u64 %rd88, [matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_2]; | |
ld.param.u64 %rd94, [matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_0]; | |
ld.param.u64 %rd95, [matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_1]; | |
mov.u32 %r49, %tid.x; | |
cvt.u64.u32 %rd96, %r49; | |
mov.u32 %r50, %tid.y; | |
mov.u32 %r51, %ctaid.y; | |
mov.u32 %r52, %ctaid.x; | |
shl.b32 %r53, %r51, 7; | |
cvt.u64.u32 %rd1, %r53; | |
mul.wide.u32 %rd97, %r50, 32; | |
shr.u64 %rd98, %rd96, 2; | |
or.b64 %rd99, %rd98, %rd97; | |
or.b64 %rd100, %rd99, %rd1; | |
mul.wide.u32 %rd101, %r49, 4; | |
shl.b64 %rd102, %rd98, 4; | |
sub.s64 %rd103, %rd101, %rd102; | |
shr.u64 %rd104, %rd96, 1; | |
and.b64 %rd105, %rd104, 12; | |
xor.b64 %rd106, %rd103, %rd105; | |
mul.wide.u32 %rd107, %r52, 256; | |
shr.u64 %rd108, %rd96, 6; | |
shl.b64 %rd109, %rd108, 8; | |
sub.s64 %rd2, %rd101, %rd109; | |
add.s64 %rd3, %rd2, %rd107; | |
mul.wide.u32 %rd110, %r50, 2; | |
or.b64 %rd4, %rd110, %rd108; | |
or.b64 %rd111, %rd4, 4; | |
or.b64 %rd112, %rd4, 8; | |
or.b64 %rd113, %rd4, 12; | |
mov.u32 %r54, %laneid; | |
cvt.s64.s32 %rd114, %r54; | |
mul.wide.u32 %rd5, %r50, 64; | |
add.s64 %rd115, %rd5, %rd114; | |
shr.s32 %r55, %r54, 31; | |
xor.b32 %r56, %r55, %r54; | |
shr.s32 %r57, %r56, 31; | |
shr.u32 %r58, %r57, 28; | |
add.s32 %r59, %r56, %r58; | |
shr.s32 %r60, %r59, 4; | |
xor.b32 %r61, %r60, %r55; | |
mul.wide.s32 %rd116, %r61, 4; | |
mul.wide.s32 %rd6, %r54, 2; | |
and.b64 %rd117, %rd6, 12; | |
xor.b64 %rd7, %rd116, %rd117; | |
add.s64 %rd118, %rd116, 8; | |
xor.b64 %rd8, %rd118, %rd117; | |
shr.u32 %r62, %r57, 30; | |
add.s32 %r63, %r56, %r62; | |
shr.s32 %r64, %r63, 2; | |
xor.b32 %r65, %r64, %r55; | |
cvt.s64.s32 %rd9, %r65; | |
mul.wide.u32 %rd119, %r49, 2; | |
and.b64 %rd10, %rd119, 192; | |
add.s64 %rd120, %rd10, %rd9; | |
and.b64 %rd121, %rd114, 3; | |
shl.b64 %rd122, %rd121, 2; | |
xor.b64 %rd11, %rd120, %rd122; | |
or.b64 %rd123, %rd121, 4; | |
shl.b64 %rd124, %rd123, 2; | |
xor.b64 %rd12, %rd120, %rd124; | |
or.b64 %rd125, %rd121, 8; | |
shl.b64 %rd126, %rd125, 2; | |
xor.b64 %rd13, %rd120, %rd126; | |
or.b64 %rd127, %rd121, 12; | |
shl.b64 %rd128, %rd127, 2; | |
xor.b64 %rd14, %rd120, %rd128; | |
add.s64 %rd129, %rd120, 8; | |
xor.b64 %rd15, %rd129, %rd122; | |
xor.b64 %rd16, %rd129, %rd124; | |
xor.b64 %rd17, %rd129, %rd126; | |
xor.b64 %rd18, %rd129, %rd128; | |
add.s64 %rd130, %rd120, 16; | |
xor.b64 %rd19, %rd130, %rd122; | |
xor.b64 %rd20, %rd130, %rd124; | |
xor.b64 %rd21, %rd130, %rd126; | |
xor.b64 %rd22, %rd130, %rd128; | |
add.s64 %rd131, %rd120, 24; | |
xor.b64 %rd23, %rd131, %rd122; | |
xor.b64 %rd24, %rd131, %rd124; | |
xor.b64 %rd25, %rd131, %rd126; | |
xor.b64 %rd26, %rd131, %rd128; | |
add.s64 %rd132, %rd120, 32; | |
xor.b64 %rd27, %rd132, %rd122; | |
xor.b64 %rd28, %rd132, %rd124; | |
xor.b64 %rd29, %rd132, %rd126; | |
xor.b64 %rd30, %rd132, %rd128; | |
add.s64 %rd133, %rd120, 40; | |
xor.b64 %rd31, %rd133, %rd122; | |
xor.b64 %rd32, %rd133, %rd124; | |
xor.b64 %rd33, %rd133, %rd126; | |
xor.b64 %rd34, %rd133, %rd128; | |
add.s64 %rd134, %rd120, 48; | |
xor.b64 %rd35, %rd134, %rd122; | |
xor.b64 %rd36, %rd134, %rd124; | |
xor.b64 %rd37, %rd134, %rd126; | |
xor.b64 %rd38, %rd134, %rd128; | |
add.s64 %rd135, %rd120, 56; | |
xor.b64 %rd39, %rd135, %rd122; | |
xor.b64 %rd40, %rd135, %rd124; | |
xor.b64 %rd41, %rd135, %rd126; | |
xor.b64 %rd42, %rd135, %rd128; | |
shl.b64 %rd43, %rd99, 4; | |
shl.b64 %rd136, %rd99, 6; | |
mov.u64 %rd137, __dynamic_shared_memory__; | |
shl.b64 %rd138, %rd106, 2; | |
add.s64 %rd60, %rd137, %rd138; | |
add.s64 %rd44, %rd60, %rd136; | |
shl.b64 %rd139, %rd100, 13; | |
add.s64 %rd140, %rd94, %rd139; | |
shl.b64 %rd141, %rd103, 2; | |
add.s64 %rd142, %rd140, %rd141; | |
cp.async.cg.shared.global [%rd44], [%rd142], 16; | |
add.s64 %rd143, %rd44, 4096; | |
or.b64 %rd144, %rd139, 524288; | |
add.s64 %rd145, %rd94, %rd144; | |
add.s64 %rd146, %rd145, %rd141; | |
cp.async.cg.shared.global [%rd143], [%rd146], 16; | |
shl.b64 %rd45, %rd4, 8; | |
add.s64 %rd147, %rd137, 32768; | |
shl.b64 %rd148, %rd4, 4; | |
shl.b64 %rd149, %rd2, 2; | |
xor.b64 %rd150, %rd148, %rd149; | |
shl.b64 %rd151, %rd4, 10; | |
add.s64 %rd61, %rd147, %rd150; | |
add.s64 %rd152, %rd61, %rd151; | |
shl.b64 %rd153, %rd4, 12; | |
shl.b64 %rd154, %rd3, 2; | |
add.s64 %rd62, %rd95, %rd154; | |
add.s64 %rd155, %rd62, %rd153; | |
cp.async.cg.shared.global [%rd152], [%rd155], 16; | |
shl.b64 %rd46, %rd111, 8; | |
shl.b64 %rd156, %rd111, 4; | |
xor.b64 %rd157, %rd156, %rd149; | |
shl.b64 %rd158, %rd111, 10; | |
add.s64 %rd63, %rd147, %rd157; | |
add.s64 %rd159, %rd63, %rd158; | |
shl.b64 %rd160, %rd111, 12; | |
add.s64 %rd161, %rd62, %rd160; | |
cp.async.cg.shared.global [%rd159], [%rd161], 16; | |
shl.b64 %rd47, %rd112, 8; | |
shl.b64 %rd162, %rd112, 4; | |
xor.b64 %rd163, %rd162, %rd149; | |
shl.b64 %rd164, %rd112, 10; | |
add.s64 %rd64, %rd147, %rd163; | |
add.s64 %rd165, %rd64, %rd164; | |
shl.b64 %rd166, %rd112, 12; | |
add.s64 %rd167, %rd62, %rd166; | |
cp.async.cg.shared.global [%rd165], [%rd167], 16; | |
shl.b64 %rd48, %rd113, 8; | |
shl.b64 %rd168, %rd113, 4; | |
xor.b64 %rd169, %rd168, %rd149; | |
shl.b64 %rd170, %rd113, 10; | |
add.s64 %rd65, %rd147, %rd169; | |
add.s64 %rd171, %rd65, %rd170; | |
shl.b64 %rd172, %rd113, 12; | |
add.s64 %rd173, %rd62, %rd172; | |
cp.async.cg.shared.global [%rd171], [%rd173], 16; | |
cp.async.commit_group; | |
add.s64 %rd174, %rd44, 8192; | |
add.s64 %rd175, %rd142, 64; | |
cp.async.cg.shared.global [%rd174], [%rd175], 16; | |
add.s64 %rd176, %rd44, 12288; | |
add.s64 %rd177, %rd146, 64; | |
cp.async.cg.shared.global [%rd176], [%rd177], 16; | |
or.b64 %rd49, %rd4, 16; | |
add.s64 %rd178, %rd152, 16384; | |
shl.b64 %rd179, %rd49, 12; | |
add.s64 %rd180, %rd62, %rd179; | |
cp.async.cg.shared.global [%rd178], [%rd180], 16; | |
or.b64 %rd50, %rd4, 20; | |
add.s64 %rd181, %rd159, 16384; | |
shl.b64 %rd182, %rd50, 12; | |
add.s64 %rd183, %rd62, %rd182; | |
cp.async.cg.shared.global [%rd181], [%rd183], 16; | |
or.b64 %rd51, %rd4, 24; | |
add.s64 %rd184, %rd165, 16384; | |
shl.b64 %rd185, %rd51, 12; | |
add.s64 %rd186, %rd62, %rd185; | |
cp.async.cg.shared.global [%rd184], [%rd186], 16; | |
or.b64 %rd52, %rd4, 28; | |
add.s64 %rd187, %rd171, 16384; | |
shl.b64 %rd188, %rd52, 12; | |
add.s64 %rd189, %rd62, %rd188; | |
cp.async.cg.shared.global [%rd187], [%rd189], 16; | |
cp.async.commit_group; | |
add.s64 %rd190, %rd44, 16384; | |
add.s64 %rd191, %rd142, 128; | |
cp.async.cg.shared.global [%rd190], [%rd191], 16; | |
add.s64 %rd192, %rd44, 20480; | |
add.s64 %rd193, %rd146, 128; | |
cp.async.cg.shared.global [%rd192], [%rd193], 16; | |
or.b64 %rd53, %rd4, 32; | |
add.s64 %rd194, %rd152, 32768; | |
shl.b64 %rd195, %rd53, 12; | |
add.s64 %rd196, %rd62, %rd195; | |
cp.async.cg.shared.global [%rd194], [%rd196], 16; | |
or.b64 %rd54, %rd4, 36; | |
add.s64 %rd197, %rd159, 32768; | |
shl.b64 %rd198, %rd54, 12; | |
add.s64 %rd199, %rd62, %rd198; | |
cp.async.cg.shared.global [%rd197], [%rd199], 16; | |
or.b64 %rd55, %rd4, 40; | |
add.s64 %rd200, %rd165, 32768; | |
shl.b64 %rd201, %rd55, 12; | |
add.s64 %rd202, %rd62, %rd201; | |
cp.async.cg.shared.global [%rd200], [%rd202], 16; | |
or.b64 %rd56, %rd4, 44; | |
add.s64 %rd203, %rd171, 32768; | |
shl.b64 %rd204, %rd56, 12; | |
add.s64 %rd205, %rd62, %rd204; | |
cp.async.cg.shared.global [%rd203], [%rd205], 16; | |
cp.async.commit_group; | |
cp.async.wait_group 2; | |
bar.sync 0; | |
mul.wide.s32 %rd206, %r61, 256; | |
shl.b64 %rd207, %rd115, 4; | |
sub.s64 %rd57, %rd207, %rd206; | |
shl.b64 %rd208, %rd57, 2; | |
add.s64 %rd209, %rd137, %rd208; | |
shl.b64 %rd210, %rd7, 2; | |
add.s64 %rd211, %rd209, %rd210; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r139, %r140, %r141, %r142}, [%rd211]; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r135, %r136, %r137, %r138}, [%rd211+1024]; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r131, %r132, %r133, %r134}, [%rd211+2048]; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r127, %r128, %r129, %r130}, [%rd211+3072]; | |
shl.b64 %rd58, %rd121, 8; | |
shl.b64 %rd212, %rd121, 10; | |
add.s64 %rd213, %rd147, %rd212; | |
shl.b64 %rd214, %rd11, 2; | |
add.s64 %rd215, %rd213, %rd214; | |
ld.shared.f32 %f708, [%rd215]; | |
shl.b64 %rd59, %rd123, 8; | |
shl.b64 %rd216, %rd123, 10; | |
add.s64 %rd217, %rd147, %rd216; | |
shl.b64 %rd218, %rd12, 2; | |
add.s64 %rd219, %rd217, %rd218; | |
ld.shared.f32 %f707, [%rd219]; | |
shl.b64 %rd220, %rd15, 2; | |
add.s64 %rd221, %rd213, %rd220; | |
ld.shared.f32 %f706, [%rd221]; | |
shl.b64 %rd222, %rd16, 2; | |
add.s64 %rd223, %rd217, %rd222; | |
ld.shared.f32 %f705, [%rd223]; | |
shl.b64 %rd224, %rd19, 2; | |
add.s64 %rd225, %rd213, %rd224; | |
ld.shared.f32 %f704, [%rd225]; | |
shl.b64 %rd226, %rd20, 2; | |
add.s64 %rd227, %rd217, %rd226; | |
ld.shared.f32 %f703, [%rd227]; | |
shl.b64 %rd228, %rd23, 2; | |
add.s64 %rd229, %rd213, %rd228; | |
ld.shared.f32 %f702, [%rd229]; | |
shl.b64 %rd230, %rd24, 2; | |
add.s64 %rd231, %rd217, %rd230; | |
ld.shared.f32 %f701, [%rd231]; | |
shl.b64 %rd232, %rd27, 2; | |
add.s64 %rd233, %rd213, %rd232; | |
ld.shared.f32 %f700, [%rd233]; | |
shl.b64 %rd234, %rd28, 2; | |
add.s64 %rd235, %rd217, %rd234; | |
ld.shared.f32 %f699, [%rd235]; | |
shl.b64 %rd236, %rd31, 2; | |
add.s64 %rd237, %rd213, %rd236; | |
ld.shared.f32 %f698, [%rd237]; | |
shl.b64 %rd238, %rd32, 2; | |
add.s64 %rd239, %rd217, %rd238; | |
ld.shared.f32 %f697, [%rd239]; | |
shl.b64 %rd240, %rd35, 2; | |
add.s64 %rd241, %rd213, %rd240; | |
ld.shared.f32 %f696, [%rd241]; | |
shl.b64 %rd242, %rd36, 2; | |
add.s64 %rd243, %rd217, %rd242; | |
ld.shared.f32 %f695, [%rd243]; | |
shl.b64 %rd244, %rd39, 2; | |
add.s64 %rd245, %rd213, %rd244; | |
ld.shared.f32 %f694, [%rd245]; | |
shl.b64 %rd246, %rd40, 2; | |
add.s64 %rd247, %rd217, %rd246; | |
ld.shared.f32 %f693, [%rd247]; | |
shl.b64 %rd66, %rd125, 8; | |
shl.b64 %rd67, %rd127, 8; | |
mul.wide.u32 %rd248, %r49, 16; | |
add.s64 %rd249, %rd248, %rd144; | |
shl.b64 %rd250, %rd98, 6; | |
sub.s64 %rd251, %rd249, %rd250; | |
add.s64 %rd252, %rd251, %rd94; | |
add.s64 %rd538, %rd252, 192; | |
mul.wide.u32 %rd253, %r51, 1048576; | |
mul.wide.u32 %rd254, %r50, 262144; | |
add.s64 %rd255, %rd253, %rd254; | |
mul.lo.s64 %rd256, %rd98, 8128; | |
add.s64 %rd257, %rd255, %rd256; | |
add.s64 %rd258, %rd257, %rd248; | |
add.s64 %rd259, %rd258, %rd94; | |
add.s64 %rd537, %rd259, 192; | |
mul.wide.u32 %rd260, %r50, 8192; | |
mul.lo.s64 %rd261, %rd108, 3072; | |
add.s64 %rd262, %rd260, %rd261; | |
mul.wide.u32 %rd263, %r52, 1024; | |
add.s64 %rd264, %rd262, %rd263; | |
add.s64 %rd265, %rd264, %rd248; | |
add.s64 %rd266, %rd265, %rd95; | |
add.s64 %rd535, %rd266, 196608; | |
add.s64 %rd267, %rd108, %rd110; | |
shl.b64 %rd268, %rd267, 10; | |
add.s64 %rd534, %rd268, 49152; | |
mov.f32 %f709, 0f00000000; | |
mov.u64 %rd541, 0; | |
mov.u64 %rd540, 1; | |
mov.u64 %rd539, 2; | |
mov.u64 %rd536, -16; | |
mov.u64 %rd533, 3; | |
shl.b64 %rd280, %rd8, 2; | |
shl.b64 %rd287, %rd13, 2; | |
shl.b64 %rd292, %rd14, 2; | |
shl.b64 %rd294, %rd17, 2; | |
shl.b64 %rd296, %rd18, 2; | |
shl.b64 %rd298, %rd21, 2; | |
shl.b64 %rd300, %rd22, 2; | |
shl.b64 %rd302, %rd25, 2; | |
shl.b64 %rd304, %rd26, 2; | |
shl.b64 %rd306, %rd29, 2; | |
shl.b64 %rd308, %rd30, 2; | |
shl.b64 %rd310, %rd33, 2; | |
shl.b64 %rd312, %rd34, 2; | |
shl.b64 %rd314, %rd37, 2; | |
shl.b64 %rd316, %rd38, 2; | |
shl.b64 %rd318, %rd41, 2; | |
shl.b64 %rd320, %rd42, 2; | |
mov.f32 %f710, %f709; | |
mov.f32 %f711, %f709; | |
mov.f32 %f712, %f709; | |
mov.f32 %f713, %f709; | |
mov.f32 %f714, %f709; | |
mov.f32 %f715, %f709; | |
mov.f32 %f716, %f709; | |
mov.f32 %f717, %f709; | |
mov.f32 %f718, %f709; | |
mov.f32 %f719, %f709; | |
mov.f32 %f720, %f709; | |
mov.f32 %f721, %f709; | |
mov.f32 %f722, %f709; | |
mov.f32 %f723, %f709; | |
mov.f32 %f724, %f709; | |
mov.f32 %f725, %f709; | |
mov.f32 %f726, %f709; | |
mov.f32 %f727, %f709; | |
mov.f32 %f728, %f709; | |
mov.f32 %f729, %f709; | |
mov.f32 %f730, %f709; | |
mov.f32 %f731, %f709; | |
mov.f32 %f732, %f709; | |
mov.f32 %f733, %f709; | |
mov.f32 %f734, %f709; | |
mov.f32 %f735, %f709; | |
mov.f32 %f736, %f709; | |
mov.f32 %f737, %f709; | |
mov.f32 %f738, %f709; | |
mov.f32 %f739, %f709; | |
mov.f32 %f740, %f709; | |
mov.f32 %f741, %f709; | |
mov.f32 %f742, %f709; | |
mov.f32 %f743, %f709; | |
mov.f32 %f744, %f709; | |
mov.f32 %f745, %f709; | |
mov.f32 %f746, %f709; | |
mov.f32 %f747, %f709; | |
mov.f32 %f748, %f709; | |
mov.f32 %f749, %f709; | |
mov.f32 %f750, %f709; | |
mov.f32 %f751, %f709; | |
mov.f32 %f752, %f709; | |
mov.f32 %f753, %f709; | |
mov.f32 %f754, %f709; | |
mov.f32 %f755, %f709; | |
mov.f32 %f756, %f709; | |
mov.f32 %f757, %f709; | |
mov.f32 %f758, %f709; | |
mov.f32 %f759, %f709; | |
mov.f32 %f760, %f709; | |
mov.f32 %f761, %f709; | |
mov.f32 %f762, %f709; | |
mov.f32 %f763, %f709; | |
mov.f32 %f764, %f709; | |
mov.f32 %f765, %f709; | |
mov.f32 %f766, %f709; | |
mov.f32 %f767, %f709; | |
mov.f32 %f768, %f709; | |
mov.f32 %f769, %f709; | |
mov.f32 %f770, %f709; | |
mov.f32 %f771, %f709; | |
mov.f32 %f772, %f709; | |
mov.f32 %f773, %f709; | |
mov.f32 %f774, %f709; | |
mov.f32 %f775, %f709; | |
mov.f32 %f776, %f709; | |
mov.f32 %f777, %f709; | |
mov.f32 %f778, %f709; | |
mov.f32 %f779, %f709; | |
mov.f32 %f780, %f709; | |
mov.f32 %f781, %f709; | |
mov.f32 %f782, %f709; | |
mov.f32 %f783, %f709; | |
mov.f32 %f784, %f709; | |
mov.f32 %f785, %f709; | |
mov.f32 %f786, %f709; | |
mov.f32 %f787, %f709; | |
mov.f32 %f788, %f709; | |
mov.f32 %f789, %f709; | |
mov.f32 %f790, %f709; | |
mov.f32 %f791, %f709; | |
mov.f32 %f792, %f709; | |
mov.f32 %f793, %f709; | |
mov.f32 %f794, %f709; | |
mov.f32 %f795, %f709; | |
mov.f32 %f796, %f709; | |
mov.f32 %f797, %f709; | |
mov.f32 %f798, %f709; | |
mov.f32 %f799, %f709; | |
mov.f32 %f800, %f709; | |
mov.f32 %f801, %f709; | |
mov.f32 %f802, %f709; | |
mov.f32 %f803, %f709; | |
mov.f32 %f804, %f709; | |
mov.f32 %f805, %f709; | |
mov.f32 %f806, %f709; | |
mov.f32 %f807, %f709; | |
mov.f32 %f808, %f709; | |
mov.f32 %f809, %f709; | |
mov.f32 %f810, %f709; | |
mov.f32 %f811, %f709; | |
mov.f32 %f812, %f709; | |
mov.f32 %f813, %f709; | |
mov.f32 %f814, %f709; | |
mov.f32 %f815, %f709; | |
mov.f32 %f816, %f709; | |
mov.f32 %f817, %f709; | |
mov.f32 %f818, %f709; | |
mov.f32 %f819, %f709; | |
mov.f32 %f820, %f709; | |
mov.f32 %f821, %f709; | |
mov.f32 %f822, %f709; | |
mov.f32 %f823, %f709; | |
mov.f32 %f824, %f709; | |
mov.f32 %f825, %f709; | |
mov.f32 %f826, %f709; | |
mov.f32 %f827, %f709; | |
mov.f32 %f828, %f709; | |
mov.f32 %f829, %f709; | |
mov.f32 %f830, %f709; | |
mov.f32 %f831, %f709; | |
mov.f32 %f832, %f709; | |
mov.f32 %f833, %f709; | |
mov.f32 %f834, %f709; | |
mov.f32 %f835, %f709; | |
mov.f32 %f836, %f709; | |
$L__BB0_1: | |
mov.u64 %rd79, %rd540; | |
mov.u64 %rd540, %rd539; | |
add.s64 %rd536, %rd536, 16; | |
setp.lt.u64 %p1, %rd536, 2000; | |
shl.b64 %rd275, %rd541, 13; | |
add.s64 %rd277, %rd137, %rd275; | |
add.s64 %rd279, %rd277, %rd208; | |
add.s64 %rd281, %rd279, %rd280; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r78, %r79, %r80, %r81}, [%rd281]; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r82, %r83, %r84, %r85}, [%rd281+1024]; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r86, %r87, %r88, %r89}, [%rd281+2048]; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r90, %r91, %r92, %r93}, [%rd281+3072]; | |
shl.b64 %rd282, %rd541, 12; | |
or.b64 %rd283, %rd282, %rd66; | |
shl.b64 %rd284, %rd283, 2; | |
add.s64 %rd286, %rd147, %rd284; | |
add.s64 %rd288, %rd286, %rd287; | |
ld.shared.u32 %r94, [%rd288]; | |
or.b64 %rd289, %rd282, %rd67; | |
shl.b64 %rd290, %rd289, 2; | |
add.s64 %rd291, %rd147, %rd290; | |
add.s64 %rd293, %rd291, %rd292; | |
ld.shared.u32 %r95, [%rd293]; | |
add.s64 %rd295, %rd286, %rd294; | |
ld.shared.u32 %r96, [%rd295]; | |
add.s64 %rd297, %rd291, %rd296; | |
ld.shared.u32 %r97, [%rd297]; | |
add.s64 %rd299, %rd286, %rd298; | |
ld.shared.u32 %r98, [%rd299]; | |
add.s64 %rd301, %rd291, %rd300; | |
ld.shared.u32 %r99, [%rd301]; | |
add.s64 %rd303, %rd286, %rd302; | |
ld.shared.u32 %r100, [%rd303]; | |
add.s64 %rd305, %rd291, %rd304; | |
ld.shared.u32 %r101, [%rd305]; | |
add.s64 %rd307, %rd286, %rd306; | |
ld.shared.u32 %r102, [%rd307]; | |
add.s64 %rd309, %rd291, %rd308; | |
ld.shared.u32 %r103, [%rd309]; | |
add.s64 %rd311, %rd286, %rd310; | |
ld.shared.u32 %r104, [%rd311]; | |
add.s64 %rd313, %rd291, %rd312; | |
ld.shared.u32 %r105, [%rd313]; | |
add.s64 %rd315, %rd286, %rd314; | |
ld.shared.u32 %r106, [%rd315]; | |
add.s64 %rd317, %rd291, %rd316; | |
ld.shared.u32 %r107, [%rd317]; | |
add.s64 %rd319, %rd286, %rd318; | |
ld.shared.u32 %r108, [%rd319]; | |
add.s64 %rd321, %rd291, %rd320; | |
ld.shared.u32 %r109, [%rd321]; | |
mov.b32 %r110, %f708; | |
mov.b32 %r111, %f707; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f437, %f438, %f439, %f440}, | |
{%r139, %r140, %r141, %r142}, | |
{%r110, %r111}, | |
{%f833, %f834, %f835, %f836}; | |
mov.b32 %r112, %f706; | |
mov.b32 %r113, %f705; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f441, %f442, %f443, %f444}, | |
{%r139, %r140, %r141, %r142}, | |
{%r112, %r113}, | |
{%f829, %f830, %f831, %f832}; | |
mov.b32 %r114, %f704; | |
mov.b32 %r115, %f703; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f445, %f446, %f447, %f448}, | |
{%r139, %r140, %r141, %r142}, | |
{%r114, %r115}, | |
{%f825, %f826, %f827, %f828}; | |
mov.b32 %r116, %f702; | |
mov.b32 %r117, %f701; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f449, %f450, %f451, %f452}, | |
{%r139, %r140, %r141, %r142}, | |
{%r116, %r117}, | |
{%f821, %f822, %f823, %f824}; | |
mov.b32 %r118, %f700; | |
mov.b32 %r119, %f699; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f453, %f454, %f455, %f456}, | |
{%r139, %r140, %r141, %r142}, | |
{%r118, %r119}, | |
{%f817, %f818, %f819, %f820}; | |
mov.b32 %r120, %f698; | |
mov.b32 %r121, %f697; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f457, %f458, %f459, %f460}, | |
{%r139, %r140, %r141, %r142}, | |
{%r120, %r121}, | |
{%f813, %f814, %f815, %f816}; | |
mov.b32 %r122, %f696; | |
mov.b32 %r123, %f695; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f461, %f462, %f463, %f464}, | |
{%r139, %r140, %r141, %r142}, | |
{%r122, %r123}, | |
{%f809, %f810, %f811, %f812}; | |
mov.b32 %r124, %f694; | |
mov.b32 %r125, %f693; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f465, %f466, %f467, %f468}, | |
{%r139, %r140, %r141, %r142}, | |
{%r124, %r125}, | |
{%f805, %f806, %f807, %f808}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f469, %f470, %f471, %f472}, | |
{%r135, %r136, %r137, %r138}, | |
{%r110, %r111}, | |
{%f801, %f802, %f803, %f804}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f473, %f474, %f475, %f476}, | |
{%r135, %r136, %r137, %r138}, | |
{%r112, %r113}, | |
{%f797, %f798, %f799, %f800}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f477, %f478, %f479, %f480}, | |
{%r135, %r136, %r137, %r138}, | |
{%r114, %r115}, | |
{%f793, %f794, %f795, %f796}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f481, %f482, %f483, %f484}, | |
{%r135, %r136, %r137, %r138}, | |
{%r116, %r117}, | |
{%f789, %f790, %f791, %f792}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f485, %f486, %f487, %f488}, | |
{%r135, %r136, %r137, %r138}, | |
{%r118, %r119}, | |
{%f785, %f786, %f787, %f788}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f489, %f490, %f491, %f492}, | |
{%r135, %r136, %r137, %r138}, | |
{%r120, %r121}, | |
{%f781, %f782, %f783, %f784}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f493, %f494, %f495, %f496}, | |
{%r135, %r136, %r137, %r138}, | |
{%r122, %r123}, | |
{%f777, %f778, %f779, %f780}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f497, %f498, %f499, %f500}, | |
{%r135, %r136, %r137, %r138}, | |
{%r124, %r125}, | |
{%f773, %f774, %f775, %f776}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f501, %f502, %f503, %f504}, | |
{%r131, %r132, %r133, %r134}, | |
{%r110, %r111}, | |
{%f769, %f770, %f771, %f772}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f505, %f506, %f507, %f508}, | |
{%r131, %r132, %r133, %r134}, | |
{%r112, %r113}, | |
{%f765, %f766, %f767, %f768}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f509, %f510, %f511, %f512}, | |
{%r131, %r132, %r133, %r134}, | |
{%r114, %r115}, | |
{%f761, %f762, %f763, %f764}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f513, %f514, %f515, %f516}, | |
{%r131, %r132, %r133, %r134}, | |
{%r116, %r117}, | |
{%f757, %f758, %f759, %f760}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f517, %f518, %f519, %f520}, | |
{%r131, %r132, %r133, %r134}, | |
{%r118, %r119}, | |
{%f753, %f754, %f755, %f756}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f521, %f522, %f523, %f524}, | |
{%r131, %r132, %r133, %r134}, | |
{%r120, %r121}, | |
{%f749, %f750, %f751, %f752}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f525, %f526, %f527, %f528}, | |
{%r131, %r132, %r133, %r134}, | |
{%r122, %r123}, | |
{%f745, %f746, %f747, %f748}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f529, %f530, %f531, %f532}, | |
{%r131, %r132, %r133, %r134}, | |
{%r124, %r125}, | |
{%f741, %f742, %f743, %f744}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f533, %f534, %f535, %f536}, | |
{%r127, %r128, %r129, %r130}, | |
{%r110, %r111}, | |
{%f737, %f738, %f739, %f740}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f537, %f538, %f539, %f540}, | |
{%r127, %r128, %r129, %r130}, | |
{%r112, %r113}, | |
{%f733, %f734, %f735, %f736}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f541, %f542, %f543, %f544}, | |
{%r127, %r128, %r129, %r130}, | |
{%r114, %r115}, | |
{%f729, %f730, %f731, %f732}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f545, %f546, %f547, %f548}, | |
{%r127, %r128, %r129, %r130}, | |
{%r116, %r117}, | |
{%f725, %f726, %f727, %f728}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f549, %f550, %f551, %f552}, | |
{%r127, %r128, %r129, %r130}, | |
{%r118, %r119}, | |
{%f721, %f722, %f723, %f724}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f553, %f554, %f555, %f556}, | |
{%r127, %r128, %r129, %r130}, | |
{%r120, %r121}, | |
{%f717, %f718, %f719, %f720}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f557, %f558, %f559, %f560}, | |
{%r127, %r128, %r129, %r130}, | |
{%r122, %r123}, | |
{%f713, %f714, %f715, %f716}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f561, %f562, %f563, %f564}, | |
{%r127, %r128, %r129, %r130}, | |
{%r124, %r125}, | |
{%f709, %f710, %f711, %f712}; | |
and.b64 %rd539, %rd533, 3; | |
shl.b64 %rd322, %rd539, 11; | |
or.b64 %rd323, %rd322, %rd43; | |
shl.b64 %rd324, %rd323, 2; | |
add.s64 %rd325, %rd60, %rd324; | |
selp.b32 %r67, 16, 0, %p1; | |
cvt.u32.u64 %r66, %rd325; | |
// begin inline asm | |
cp.async.cg.shared.global [%r66], [%rd537], 16, %r67; | |
// end inline asm | |
shl.b64 %rd326, %rd539, 13; | |
add.s64 %rd327, %rd44, %rd326; | |
cvt.u32.u64 %r126, %rd327; | |
add.s32 %r68, %r126, 4096; | |
// begin inline asm | |
cp.async.cg.shared.global [%r68], [%rd538], 16, %r67; | |
// end inline asm | |
shl.b64 %rd328, %rd539, 12; | |
or.b64 %rd329, %rd328, %rd45; | |
shl.b64 %rd330, %rd329, 2; | |
add.s64 %rd331, %rd61, %rd330; | |
cvt.u32.u64 %r70, %rd331; | |
// begin inline asm | |
cp.async.cg.shared.global [%r70], [%rd535], 16, %r67; | |
// end inline asm | |
or.b64 %rd332, %rd328, %rd46; | |
shl.b64 %rd333, %rd332, 2; | |
add.s64 %rd334, %rd63, %rd333; | |
shl.b64 %rd335, %rd534, 2; | |
or.b64 %rd336, %rd335, 16384; | |
add.s64 %rd272, %rd62, %rd336; | |
cvt.u32.u64 %r72, %rd334; | |
// begin inline asm | |
cp.async.cg.shared.global [%r72], [%rd272], 16, %r67; | |
// end inline asm | |
or.b64 %rd337, %rd328, %rd47; | |
shl.b64 %rd338, %rd337, 2; | |
add.s64 %rd339, %rd64, %rd338; | |
or.b64 %rd340, %rd335, 32768; | |
add.s64 %rd273, %rd62, %rd340; | |
cvt.u32.u64 %r74, %rd339; | |
// begin inline asm | |
cp.async.cg.shared.global [%r74], [%rd273], 16, %r67; | |
// end inline asm | |
or.b64 %rd341, %rd328, %rd48; | |
shl.b64 %rd342, %rd341, 2; | |
add.s64 %rd343, %rd65, %rd342; | |
or.b64 %rd344, %rd335, 49152; | |
add.s64 %rd274, %rd62, %rd344; | |
cvt.u32.u64 %r76, %rd343; | |
// begin inline asm | |
cp.async.cg.shared.global [%r76], [%rd274], 16, %r67; | |
// end inline asm | |
cp.async.commit_group; | |
cp.async.wait_group 2; | |
bar.sync 0; | |
shl.b64 %rd345, %rd79, 13; | |
add.s64 %rd346, %rd137, %rd345; | |
add.s64 %rd347, %rd346, %rd208; | |
add.s64 %rd349, %rd347, %rd210; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r139, %r140, %r141, %r142}, [%rd349]; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r135, %r136, %r137, %r138}, [%rd349+1024]; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r131, %r132, %r133, %r134}, [%rd349+2048]; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r127, %r128, %r129, %r130}, [%rd349+3072]; | |
shl.b64 %rd350, %rd79, 12; | |
or.b64 %rd351, %rd350, %rd58; | |
shl.b64 %rd352, %rd351, 2; | |
add.s64 %rd353, %rd147, %rd352; | |
add.s64 %rd355, %rd353, %rd214; | |
ld.shared.f32 %f708, [%rd355]; | |
or.b64 %rd356, %rd350, %rd59; | |
shl.b64 %rd357, %rd356, 2; | |
add.s64 %rd358, %rd147, %rd357; | |
add.s64 %rd360, %rd358, %rd218; | |
ld.shared.f32 %f707, [%rd360]; | |
add.s64 %rd362, %rd353, %rd220; | |
ld.shared.f32 %f706, [%rd362]; | |
add.s64 %rd364, %rd358, %rd222; | |
ld.shared.f32 %f705, [%rd364]; | |
add.s64 %rd366, %rd353, %rd224; | |
ld.shared.f32 %f704, [%rd366]; | |
add.s64 %rd368, %rd358, %rd226; | |
ld.shared.f32 %f703, [%rd368]; | |
add.s64 %rd370, %rd353, %rd228; | |
ld.shared.f32 %f702, [%rd370]; | |
add.s64 %rd372, %rd358, %rd230; | |
ld.shared.f32 %f701, [%rd372]; | |
add.s64 %rd374, %rd353, %rd232; | |
ld.shared.f32 %f700, [%rd374]; | |
add.s64 %rd376, %rd358, %rd234; | |
ld.shared.f32 %f699, [%rd376]; | |
add.s64 %rd378, %rd353, %rd236; | |
ld.shared.f32 %f698, [%rd378]; | |
add.s64 %rd380, %rd358, %rd238; | |
ld.shared.f32 %f697, [%rd380]; | |
add.s64 %rd382, %rd353, %rd240; | |
ld.shared.f32 %f696, [%rd382]; | |
add.s64 %rd384, %rd358, %rd242; | |
ld.shared.f32 %f695, [%rd384]; | |
add.s64 %rd386, %rd353, %rd244; | |
ld.shared.f32 %f694, [%rd386]; | |
add.s64 %rd388, %rd358, %rd246; | |
ld.shared.f32 %f693, [%rd388]; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f833, %f834, %f835, %f836}, | |
{%r78, %r79, %r80, %r81}, | |
{%r94, %r95}, | |
{%f437, %f438, %f439, %f440}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f829, %f830, %f831, %f832}, | |
{%r78, %r79, %r80, %r81}, | |
{%r96, %r97}, | |
{%f441, %f442, %f443, %f444}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f825, %f826, %f827, %f828}, | |
{%r78, %r79, %r80, %r81}, | |
{%r98, %r99}, | |
{%f445, %f446, %f447, %f448}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f821, %f822, %f823, %f824}, | |
{%r78, %r79, %r80, %r81}, | |
{%r100, %r101}, | |
{%f449, %f450, %f451, %f452}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f817, %f818, %f819, %f820}, | |
{%r78, %r79, %r80, %r81}, | |
{%r102, %r103}, | |
{%f453, %f454, %f455, %f456}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f813, %f814, %f815, %f816}, | |
{%r78, %r79, %r80, %r81}, | |
{%r104, %r105}, | |
{%f457, %f458, %f459, %f460}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f809, %f810, %f811, %f812}, | |
{%r78, %r79, %r80, %r81}, | |
{%r106, %r107}, | |
{%f461, %f462, %f463, %f464}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f805, %f806, %f807, %f808}, | |
{%r78, %r79, %r80, %r81}, | |
{%r108, %r109}, | |
{%f465, %f466, %f467, %f468}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f801, %f802, %f803, %f804}, | |
{%r82, %r83, %r84, %r85}, | |
{%r94, %r95}, | |
{%f469, %f470, %f471, %f472}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f797, %f798, %f799, %f800}, | |
{%r82, %r83, %r84, %r85}, | |
{%r96, %r97}, | |
{%f473, %f474, %f475, %f476}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f793, %f794, %f795, %f796}, | |
{%r82, %r83, %r84, %r85}, | |
{%r98, %r99}, | |
{%f477, %f478, %f479, %f480}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f789, %f790, %f791, %f792}, | |
{%r82, %r83, %r84, %r85}, | |
{%r100, %r101}, | |
{%f481, %f482, %f483, %f484}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f785, %f786, %f787, %f788}, | |
{%r82, %r83, %r84, %r85}, | |
{%r102, %r103}, | |
{%f485, %f486, %f487, %f488}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f781, %f782, %f783, %f784}, | |
{%r82, %r83, %r84, %r85}, | |
{%r104, %r105}, | |
{%f489, %f490, %f491, %f492}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f777, %f778, %f779, %f780}, | |
{%r82, %r83, %r84, %r85}, | |
{%r106, %r107}, | |
{%f493, %f494, %f495, %f496}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f773, %f774, %f775, %f776}, | |
{%r82, %r83, %r84, %r85}, | |
{%r108, %r109}, | |
{%f497, %f498, %f499, %f500}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f769, %f770, %f771, %f772}, | |
{%r86, %r87, %r88, %r89}, | |
{%r94, %r95}, | |
{%f501, %f502, %f503, %f504}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f765, %f766, %f767, %f768}, | |
{%r86, %r87, %r88, %r89}, | |
{%r96, %r97}, | |
{%f505, %f506, %f507, %f508}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f761, %f762, %f763, %f764}, | |
{%r86, %r87, %r88, %r89}, | |
{%r98, %r99}, | |
{%f509, %f510, %f511, %f512}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f757, %f758, %f759, %f760}, | |
{%r86, %r87, %r88, %r89}, | |
{%r100, %r101}, | |
{%f513, %f514, %f515, %f516}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f753, %f754, %f755, %f756}, | |
{%r86, %r87, %r88, %r89}, | |
{%r102, %r103}, | |
{%f517, %f518, %f519, %f520}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f749, %f750, %f751, %f752}, | |
{%r86, %r87, %r88, %r89}, | |
{%r104, %r105}, | |
{%f521, %f522, %f523, %f524}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f745, %f746, %f747, %f748}, | |
{%r86, %r87, %r88, %r89}, | |
{%r106, %r107}, | |
{%f525, %f526, %f527, %f528}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f741, %f742, %f743, %f744}, | |
{%r86, %r87, %r88, %r89}, | |
{%r108, %r109}, | |
{%f529, %f530, %f531, %f532}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f737, %f738, %f739, %f740}, | |
{%r90, %r91, %r92, %r93}, | |
{%r94, %r95}, | |
{%f533, %f534, %f535, %f536}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f733, %f734, %f735, %f736}, | |
{%r90, %r91, %r92, %r93}, | |
{%r96, %r97}, | |
{%f537, %f538, %f539, %f540}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f729, %f730, %f731, %f732}, | |
{%r90, %r91, %r92, %r93}, | |
{%r98, %r99}, | |
{%f541, %f542, %f543, %f544}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f725, %f726, %f727, %f728}, | |
{%r90, %r91, %r92, %r93}, | |
{%r100, %r101}, | |
{%f545, %f546, %f547, %f548}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f721, %f722, %f723, %f724}, | |
{%r90, %r91, %r92, %r93}, | |
{%r102, %r103}, | |
{%f549, %f550, %f551, %f552}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f717, %f718, %f719, %f720}, | |
{%r90, %r91, %r92, %r93}, | |
{%r104, %r105}, | |
{%f553, %f554, %f555, %f556}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f713, %f714, %f715, %f716}, | |
{%r90, %r91, %r92, %r93}, | |
{%r106, %r107}, | |
{%f557, %f558, %f559, %f560}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f709, %f710, %f711, %f712}, | |
{%r90, %r91, %r92, %r93}, | |
{%r108, %r109}, | |
{%f561, %f562, %f563, %f564}; | |
add.s64 %rd538, %rd538, 64; | |
add.s64 %rd537, %rd537, 64; | |
add.s64 %rd535, %rd535, 65536; | |
add.s64 %rd534, %rd534, 16384; | |
add.s64 %rd533, %rd533, 1; | |
setp.lt.u64 %p2, %rd536, 2032; | |
mov.u64 %rd541, %rd79; | |
@%p2 bra $L__BB0_1; | |
add.s64 %rd389, %rd5, %rd9; | |
shl.b64 %rd390, %rd9, 3; | |
sub.s64 %rd391, %rd6, %rd390; | |
add.s64 %rd392, %rd391, %rd10; | |
cp.async.commit_group; | |
cp.async.wait_group 0; | |
bar.sync 0; | |
shl.b64 %rd393, %rd389, 10; | |
add.s64 %rd395, %rd137, %rd393; | |
shl.b64 %rd396, %rd392, 2; | |
add.s64 %rd397, %rd395, %rd396; | |
st.shared.v2.f32 [%rd397+49376], {%f709, %f710}; | |
st.shared.v2.f32 [%rd397+57568], {%f711, %f712}; | |
st.shared.v2.f32 [%rd397+49344], {%f713, %f714}; | |
st.shared.v2.f32 [%rd397+57536], {%f715, %f716}; | |
st.shared.v2.f32 [%rd397+49312], {%f717, %f718}; | |
st.shared.v2.f32 [%rd397+57504], {%f719, %f720}; | |
st.shared.v2.f32 [%rd397+49280], {%f721, %f722}; | |
st.shared.v2.f32 [%rd397+57472], {%f723, %f724}; | |
st.shared.v2.f32 [%rd397+49248], {%f725, %f726}; | |
st.shared.v2.f32 [%rd397+57440], {%f727, %f728}; | |
st.shared.v2.f32 [%rd397+49216], {%f729, %f730}; | |
st.shared.v2.f32 [%rd397+57408], {%f731, %f732}; | |
st.shared.v2.f32 [%rd397+49184], {%f733, %f734}; | |
st.shared.v2.f32 [%rd397+57376], {%f735, %f736}; | |
st.shared.v2.f32 [%rd397+49152], {%f737, %f738}; | |
st.shared.v2.f32 [%rd397+57344], {%f739, %f740}; | |
st.shared.v2.f32 [%rd397+32992], {%f741, %f742}; | |
st.shared.v2.f32 [%rd397+41184], {%f743, %f744}; | |
st.shared.v2.f32 [%rd397+32960], {%f745, %f746}; | |
st.shared.v2.f32 [%rd397+41152], {%f747, %f748}; | |
st.shared.v2.f32 [%rd397+32928], {%f749, %f750}; | |
st.shared.v2.f32 [%rd397+41120], {%f751, %f752}; | |
st.shared.v2.f32 [%rd397+32896], {%f753, %f754}; | |
st.shared.v2.f32 [%rd397+41088], {%f755, %f756}; | |
st.shared.v2.f32 [%rd397+32864], {%f757, %f758}; | |
st.shared.v2.f32 [%rd397+41056], {%f759, %f760}; | |
st.shared.v2.f32 [%rd397+32832], {%f761, %f762}; | |
st.shared.v2.f32 [%rd397+41024], {%f763, %f764}; | |
st.shared.v2.f32 [%rd397+32800], {%f765, %f766}; | |
st.shared.v2.f32 [%rd397+40992], {%f767, %f768}; | |
st.shared.v2.f32 [%rd397+32768], {%f769, %f770}; | |
st.shared.v2.f32 [%rd397+40960], {%f771, %f772}; | |
st.shared.v2.f32 [%rd397+16608], {%f773, %f774}; | |
st.shared.v2.f32 [%rd397+24800], {%f775, %f776}; | |
st.shared.v2.f32 [%rd397+16576], {%f777, %f778}; | |
st.shared.v2.f32 [%rd397+24768], {%f779, %f780}; | |
st.shared.v2.f32 [%rd397+16544], {%f781, %f782}; | |
st.shared.v2.f32 [%rd397+24736], {%f783, %f784}; | |
st.shared.v2.f32 [%rd397+16512], {%f785, %f786}; | |
st.shared.v2.f32 [%rd397+24704], {%f787, %f788}; | |
st.shared.v2.f32 [%rd397+16480], {%f789, %f790}; | |
st.shared.v2.f32 [%rd397+24672], {%f791, %f792}; | |
st.shared.v2.f32 [%rd397+16448], {%f793, %f794}; | |
st.shared.v2.f32 [%rd397+24640], {%f795, %f796}; | |
st.shared.v2.f32 [%rd397+16416], {%f797, %f798}; | |
st.shared.v2.f32 [%rd397+24608], {%f799, %f800}; | |
st.shared.v2.f32 [%rd397+16384], {%f801, %f802}; | |
st.shared.v2.f32 [%rd397+24576], {%f803, %f804}; | |
st.shared.v2.f32 [%rd397+224], {%f805, %f806}; | |
st.shared.v2.f32 [%rd397+8416], {%f807, %f808}; | |
st.shared.v2.f32 [%rd397+192], {%f809, %f810}; | |
st.shared.v2.f32 [%rd397+8384], {%f811, %f812}; | |
st.shared.v2.f32 [%rd397+160], {%f813, %f814}; | |
st.shared.v2.f32 [%rd397+8352], {%f815, %f816}; | |
st.shared.v2.f32 [%rd397+128], {%f817, %f818}; | |
st.shared.v2.f32 [%rd397+8320], {%f819, %f820}; | |
st.shared.v2.f32 [%rd397+96], {%f821, %f822}; | |
st.shared.v2.f32 [%rd397+8288], {%f823, %f824}; | |
st.shared.v2.f32 [%rd397+64], {%f825, %f826}; | |
st.shared.v2.f32 [%rd397+8256], {%f827, %f828}; | |
st.shared.v2.f32 [%rd397+32], {%f829, %f830}; | |
st.shared.v2.f32 [%rd397+8224], {%f831, %f832}; | |
st.shared.v2.f32 [%rd397], {%f833, %f834}; | |
st.shared.v2.f32 [%rd397+8192], {%f835, %f836}; | |
bar.sync 0; | |
shl.b64 %rd398, %rd45, 2; | |
add.s64 %rd399, %rd137, %rd398; | |
add.s64 %rd401, %rd399, %rd149; | |
ld.shared.v4.f32 {%f565, %f566, %f567, %f568}, [%rd401]; | |
or.b64 %rd402, %rd4, %rd1; | |
shl.b64 %rd403, %rd402, 12; | |
add.s64 %rd404, %rd88, %rd403; | |
add.s64 %rd406, %rd404, %rd154; | |
st.global.v4.f32 [%rd406], {%f565, %f566, %f567, %f568}; | |
shl.b64 %rd407, %rd46, 2; | |
add.s64 %rd408, %rd137, %rd407; | |
add.s64 %rd409, %rd408, %rd149; | |
ld.shared.v4.f32 {%f569, %f570, %f571, %f572}, [%rd409]; | |
or.b64 %rd410, %rd403, 16384; | |
add.s64 %rd411, %rd88, %rd410; | |
add.s64 %rd412, %rd411, %rd154; | |
st.global.v4.f32 [%rd412], {%f569, %f570, %f571, %f572}; | |
shl.b64 %rd413, %rd47, 2; | |
add.s64 %rd414, %rd137, %rd413; | |
add.s64 %rd415, %rd414, %rd149; | |
ld.shared.v4.f32 {%f573, %f574, %f575, %f576}, [%rd415]; | |
or.b64 %rd416, %rd403, 32768; | |
add.s64 %rd417, %rd88, %rd416; | |
add.s64 %rd418, %rd417, %rd154; | |
st.global.v4.f32 [%rd418], {%f573, %f574, %f575, %f576}; | |
shl.b64 %rd419, %rd48, 2; | |
add.s64 %rd420, %rd137, %rd419; | |
add.s64 %rd421, %rd420, %rd149; | |
ld.shared.v4.f32 {%f577, %f578, %f579, %f580}, [%rd421]; | |
or.b64 %rd422, %rd403, 49152; | |
add.s64 %rd423, %rd88, %rd422; | |
add.s64 %rd424, %rd423, %rd154; | |
st.global.v4.f32 [%rd424], {%f577, %f578, %f579, %f580}; | |
shl.b64 %rd425, %rd49, 10; | |
add.s64 %rd426, %rd137, %rd425; | |
add.s64 %rd427, %rd426, %rd149; | |
ld.shared.v4.f32 {%f581, %f582, %f583, %f584}, [%rd427]; | |
or.b64 %rd428, %rd403, 65536; | |
add.s64 %rd429, %rd88, %rd428; | |
add.s64 %rd430, %rd429, %rd154; | |
st.global.v4.f32 [%rd430], {%f581, %f582, %f583, %f584}; | |
shl.b64 %rd431, %rd50, 10; | |
add.s64 %rd432, %rd137, %rd431; | |
add.s64 %rd433, %rd432, %rd149; | |
ld.shared.v4.f32 {%f585, %f586, %f587, %f588}, [%rd433]; | |
or.b64 %rd434, %rd403, 81920; | |
add.s64 %rd435, %rd88, %rd434; | |
add.s64 %rd436, %rd435, %rd154; | |
st.global.v4.f32 [%rd436], {%f585, %f586, %f587, %f588}; | |
shl.b64 %rd437, %rd51, 10; | |
add.s64 %rd438, %rd137, %rd437; | |
add.s64 %rd439, %rd438, %rd149; | |
ld.shared.v4.f32 {%f589, %f590, %f591, %f592}, [%rd439]; | |
or.b64 %rd440, %rd403, 98304; | |
add.s64 %rd441, %rd88, %rd440; | |
add.s64 %rd442, %rd441, %rd154; | |
st.global.v4.f32 [%rd442], {%f589, %f590, %f591, %f592}; | |
shl.b64 %rd443, %rd52, 10; | |
add.s64 %rd444, %rd137, %rd443; | |
add.s64 %rd445, %rd444, %rd149; | |
ld.shared.v4.f32 {%f593, %f594, %f595, %f596}, [%rd445]; | |
or.b64 %rd446, %rd403, 114688; | |
add.s64 %rd447, %rd88, %rd446; | |
add.s64 %rd448, %rd447, %rd154; | |
st.global.v4.f32 [%rd448], {%f593, %f594, %f595, %f596}; | |
shl.b64 %rd449, %rd53, 10; | |
add.s64 %rd450, %rd137, %rd449; | |
add.s64 %rd451, %rd450, %rd149; | |
ld.shared.v4.f32 {%f597, %f598, %f599, %f600}, [%rd451]; | |
or.b64 %rd452, %rd403, 131072; | |
add.s64 %rd453, %rd88, %rd452; | |
add.s64 %rd454, %rd453, %rd154; | |
st.global.v4.f32 [%rd454], {%f597, %f598, %f599, %f600}; | |
shl.b64 %rd455, %rd54, 10; | |
add.s64 %rd456, %rd137, %rd455; | |
add.s64 %rd457, %rd456, %rd149; | |
ld.shared.v4.f32 {%f601, %f602, %f603, %f604}, [%rd457]; | |
or.b64 %rd458, %rd403, 147456; | |
add.s64 %rd459, %rd88, %rd458; | |
add.s64 %rd460, %rd459, %rd154; | |
st.global.v4.f32 [%rd460], {%f601, %f602, %f603, %f604}; | |
shl.b64 %rd461, %rd55, 10; | |
add.s64 %rd462, %rd137, %rd461; | |
add.s64 %rd463, %rd462, %rd149; | |
ld.shared.v4.f32 {%f605, %f606, %f607, %f608}, [%rd463]; | |
or.b64 %rd464, %rd403, 163840; | |
add.s64 %rd465, %rd88, %rd464; | |
add.s64 %rd466, %rd465, %rd154; | |
st.global.v4.f32 [%rd466], {%f605, %f606, %f607, %f608}; | |
shl.b64 %rd467, %rd56, 10; | |
add.s64 %rd468, %rd137, %rd467; | |
add.s64 %rd469, %rd468, %rd149; | |
ld.shared.v4.f32 {%f609, %f610, %f611, %f612}, [%rd469]; | |
or.b64 %rd470, %rd403, 180224; | |
add.s64 %rd471, %rd88, %rd470; | |
add.s64 %rd472, %rd471, %rd154; | |
st.global.v4.f32 [%rd472], {%f609, %f610, %f611, %f612}; | |
ld.shared.v4.f32 {%f613, %f614, %f615, %f616}, [%rd401+49152]; | |
or.b64 %rd473, %rd403, 196608; | |
add.s64 %rd474, %rd88, %rd473; | |
add.s64 %rd475, %rd474, %rd154; | |
st.global.v4.f32 [%rd475], {%f613, %f614, %f615, %f616}; | |
ld.shared.v4.f32 {%f617, %f618, %f619, %f620}, [%rd401+53248]; | |
or.b64 %rd476, %rd403, 212992; | |
add.s64 %rd477, %rd88, %rd476; | |
add.s64 %rd478, %rd477, %rd154; | |
st.global.v4.f32 [%rd478], {%f617, %f618, %f619, %f620}; | |
ld.shared.v4.f32 {%f621, %f622, %f623, %f624}, [%rd401+57344]; | |
or.b64 %rd479, %rd403, 229376; | |
add.s64 %rd480, %rd88, %rd479; | |
add.s64 %rd481, %rd480, %rd154; | |
st.global.v4.f32 [%rd481], {%f621, %f622, %f623, %f624}; | |
ld.shared.v4.f32 {%f625, %f626, %f627, %f628}, [%rd401+61440]; | |
or.b64 %rd482, %rd403, 245760; | |
add.s64 %rd483, %rd88, %rd482; | |
add.s64 %rd484, %rd483, %rd154; | |
st.global.v4.f32 [%rd484], {%f625, %f626, %f627, %f628}; | |
ld.shared.v4.f32 {%f629, %f630, %f631, %f632}, [%rd401+65536]; | |
or.b64 %rd485, %rd403, 262144; | |
add.s64 %rd486, %rd88, %rd485; | |
add.s64 %rd487, %rd486, %rd154; | |
st.global.v4.f32 [%rd487], {%f629, %f630, %f631, %f632}; | |
ld.shared.v4.f32 {%f633, %f634, %f635, %f636}, [%rd401+69632]; | |
or.b64 %rd488, %rd403, 278528; | |
add.s64 %rd489, %rd88, %rd488; | |
add.s64 %rd490, %rd489, %rd154; | |
st.global.v4.f32 [%rd490], {%f633, %f634, %f635, %f636}; | |
ld.shared.v4.f32 {%f637, %f638, %f639, %f640}, [%rd401+73728]; | |
or.b64 %rd491, %rd403, 294912; | |
add.s64 %rd492, %rd88, %rd491; | |
add.s64 %rd493, %rd492, %rd154; | |
st.global.v4.f32 [%rd493], {%f637, %f638, %f639, %f640}; | |
ld.shared.v4.f32 {%f641, %f642, %f643, %f644}, [%rd401+77824]; | |
or.b64 %rd494, %rd403, 311296; | |
add.s64 %rd495, %rd88, %rd494; | |
add.s64 %rd496, %rd495, %rd154; | |
st.global.v4.f32 [%rd496], {%f641, %f642, %f643, %f644}; | |
ld.shared.v4.f32 {%f645, %f646, %f647, %f648}, [%rd401+81920]; | |
or.b64 %rd497, %rd403, 327680; | |
add.s64 %rd498, %rd88, %rd497; | |
add.s64 %rd499, %rd498, %rd154; | |
st.global.v4.f32 [%rd499], {%f645, %f646, %f647, %f648}; | |
ld.shared.v4.f32 {%f649, %f650, %f651, %f652}, [%rd401+86016]; | |
or.b64 %rd500, %rd403, 344064; | |
add.s64 %rd501, %rd88, %rd500; | |
add.s64 %rd502, %rd501, %rd154; | |
st.global.v4.f32 [%rd502], {%f649, %f650, %f651, %f652}; | |
ld.shared.v4.f32 {%f653, %f654, %f655, %f656}, [%rd401+90112]; | |
or.b64 %rd503, %rd403, 360448; | |
add.s64 %rd504, %rd88, %rd503; | |
add.s64 %rd505, %rd504, %rd154; | |
st.global.v4.f32 [%rd505], {%f653, %f654, %f655, %f656}; | |
ld.shared.v4.f32 {%f657, %f658, %f659, %f660}, [%rd401+94208]; | |
or.b64 %rd506, %rd403, 376832; | |
add.s64 %rd507, %rd88, %rd506; | |
add.s64 %rd508, %rd507, %rd154; | |
st.global.v4.f32 [%rd508], {%f657, %f658, %f659, %f660}; | |
ld.shared.v4.f32 {%f661, %f662, %f663, %f664}, [%rd401+98304]; | |
or.b64 %rd509, %rd403, 393216; | |
add.s64 %rd510, %rd88, %rd509; | |
add.s64 %rd511, %rd510, %rd154; | |
st.global.v4.f32 [%rd511], {%f661, %f662, %f663, %f664}; | |
ld.shared.v4.f32 {%f665, %f666, %f667, %f668}, [%rd401+102400]; | |
or.b64 %rd512, %rd403, 409600; | |
add.s64 %rd513, %rd88, %rd512; | |
add.s64 %rd514, %rd513, %rd154; | |
st.global.v4.f32 [%rd514], {%f665, %f666, %f667, %f668}; | |
ld.shared.v4.f32 {%f669, %f670, %f671, %f672}, [%rd401+106496]; | |
or.b64 %rd515, %rd403, 425984; | |
add.s64 %rd516, %rd88, %rd515; | |
add.s64 %rd517, %rd516, %rd154; | |
st.global.v4.f32 [%rd517], {%f669, %f670, %f671, %f672}; | |
ld.shared.v4.f32 {%f673, %f674, %f675, %f676}, [%rd401+110592]; | |
or.b64 %rd518, %rd403, 442368; | |
add.s64 %rd519, %rd88, %rd518; | |
add.s64 %rd520, %rd519, %rd154; | |
st.global.v4.f32 [%rd520], {%f673, %f674, %f675, %f676}; | |
ld.shared.v4.f32 {%f677, %f678, %f679, %f680}, [%rd401+114688]; | |
or.b64 %rd521, %rd403, 458752; | |
add.s64 %rd522, %rd88, %rd521; | |
add.s64 %rd523, %rd522, %rd154; | |
st.global.v4.f32 [%rd523], {%f677, %f678, %f679, %f680}; | |
ld.shared.v4.f32 {%f681, %f682, %f683, %f684}, [%rd401+118784]; | |
or.b64 %rd524, %rd403, 475136; | |
add.s64 %rd525, %rd88, %rd524; | |
add.s64 %rd526, %rd525, %rd154; | |
st.global.v4.f32 [%rd526], {%f681, %f682, %f683, %f684}; | |
ld.shared.v4.f32 {%f685, %f686, %f687, %f688}, [%rd401+122880]; | |
or.b64 %rd527, %rd403, 491520; | |
add.s64 %rd528, %rd88, %rd527; | |
add.s64 %rd529, %rd528, %rd154; | |
st.global.v4.f32 [%rd529], {%f685, %f686, %f687, %f688}; | |
ld.shared.v4.f32 {%f689, %f690, %f691, %f692}, [%rd401+126976]; | |
or.b64 %rd530, %rd403, 507904; | |
add.s64 %rd531, %rd88, %rd530; | |
add.s64 %rd532, %rd531, %rd154; | |
st.global.v4.f32 [%rd532], {%f689, %f690, %f691, %f692}; | |
bar.sync 0; | |
ret; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// Generated by LLVM NVPTX Back-End | |
// | |
// Module preamble: these directives apply to everything that follows in this
// PTX module and must precede the kernel definition.
//
// PTX ISA version this module is written against.
.version 7.6 | |
// Target architecture the code is generated for (sm_80).
.target sm_80 | |
// Addresses are 64 bits wide throughout the module.
.address_size 64 | |
// .globl matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32 | |
// Shared-state-space byte array, declared extern with 16-byte alignment and
// no size given here. The kernel below uses its address as the base for its
// cp.async destinations and ld.shared/ldmatrix sources.
// NOTE(review): presumably sized by the host at kernel launch - confirm
// against the launching code.
.extern .shared .align 16 .b8 __dynamic_shared_memory__[]; | |
.visible .entry matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32( | |
.param .u64 matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_0, | |
.param .u64 matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_1, | |
.param .u64 matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_2 | |
) | |
.maxntid 128, 2, 1 | |
{ | |
.reg .pred %p<3>; | |
.reg .b32 %r<143>; | |
.reg .f32 %f<837>; | |
.reg .b64 %rd<438>; | |
ld.param.u64 %rd82, [matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_2]; | |
ld.param.u64 %rd88, [matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_0]; | |
ld.param.u64 %rd89, [matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32_param_1]; | |
mov.u32 %r49, %tid.x; | |
cvt.u64.u32 %rd90, %r49; | |
mov.u32 %r50, %tid.y; | |
mov.u32 %r51, %ctaid.y; | |
mov.u32 %r52, %ctaid.x; | |
shl.b32 %r53, %r51, 7; | |
cvt.u64.u32 %rd1, %r53; | |
mul.wide.u32 %rd91, %r50, 32; | |
shr.u64 %rd92, %rd90, 2; | |
or.b64 %rd93, %rd92, %rd91; | |
or.b64 %rd94, %rd93, %rd1; | |
mul.wide.u32 %rd95, %r49, 4; | |
shl.b64 %rd96, %rd92, 4; | |
sub.s64 %rd97, %rd95, %rd96; | |
shr.u64 %rd98, %rd90, 1; | |
and.b64 %rd99, %rd98, 12; | |
xor.b64 %rd100, %rd97, %rd99; | |
mul.wide.u32 %rd101, %r52, 256; | |
shr.u64 %rd102, %rd90, 6; | |
shl.b64 %rd103, %rd102, 8; | |
sub.s64 %rd2, %rd95, %rd103; | |
add.s64 %rd3, %rd2, %rd101; | |
mul.wide.u32 %rd104, %r50, 2; | |
or.b64 %rd4, %rd104, %rd102; | |
or.b64 %rd105, %rd4, 4; | |
or.b64 %rd106, %rd4, 8; | |
or.b64 %rd107, %rd4, 12; | |
mov.u32 %r54, %laneid; | |
cvt.s64.s32 %rd108, %r54; | |
mul.wide.u32 %rd5, %r50, 64; | |
add.s64 %rd109, %rd5, %rd108; | |
shr.s32 %r55, %r54, 31; | |
xor.b32 %r56, %r55, %r54; | |
shr.s32 %r57, %r56, 31; | |
shr.u32 %r58, %r57, 28; | |
add.s32 %r59, %r56, %r58; | |
shr.s32 %r60, %r59, 4; | |
xor.b32 %r61, %r60, %r55; | |
mul.wide.s32 %rd110, %r61, 4; | |
mul.wide.s32 %rd6, %r54, 2; | |
and.b64 %rd111, %rd6, 12; | |
xor.b64 %rd7, %rd110, %rd111; | |
add.s64 %rd112, %rd110, 8; | |
xor.b64 %rd8, %rd112, %rd111; | |
shr.u32 %r62, %r57, 30; | |
add.s32 %r63, %r56, %r62; | |
shr.s32 %r64, %r63, 2; | |
xor.b32 %r65, %r64, %r55; | |
cvt.s64.s32 %rd9, %r65; | |
mul.wide.u32 %rd113, %r49, 2; | |
and.b64 %rd10, %rd113, 192; | |
add.s64 %rd114, %rd10, %rd9; | |
and.b64 %rd115, %rd108, 3; | |
shl.b64 %rd116, %rd115, 2; | |
xor.b64 %rd11, %rd114, %rd116; | |
or.b64 %rd117, %rd115, 4; | |
shl.b64 %rd118, %rd117, 2; | |
xor.b64 %rd12, %rd114, %rd118; | |
or.b64 %rd119, %rd115, 8; | |
shl.b64 %rd120, %rd119, 2; | |
xor.b64 %rd13, %rd114, %rd120; | |
or.b64 %rd121, %rd115, 12; | |
shl.b64 %rd122, %rd121, 2; | |
xor.b64 %rd14, %rd114, %rd122; | |
add.s64 %rd123, %rd114, 8; | |
xor.b64 %rd15, %rd123, %rd116; | |
xor.b64 %rd16, %rd123, %rd118; | |
xor.b64 %rd17, %rd123, %rd120; | |
xor.b64 %rd18, %rd123, %rd122; | |
add.s64 %rd124, %rd114, 16; | |
xor.b64 %rd19, %rd124, %rd116; | |
xor.b64 %rd20, %rd124, %rd118; | |
xor.b64 %rd21, %rd124, %rd120; | |
xor.b64 %rd22, %rd124, %rd122; | |
add.s64 %rd125, %rd114, 24; | |
xor.b64 %rd23, %rd125, %rd116; | |
xor.b64 %rd24, %rd125, %rd118; | |
xor.b64 %rd25, %rd125, %rd120; | |
xor.b64 %rd26, %rd125, %rd122; | |
add.s64 %rd126, %rd114, 32; | |
xor.b64 %rd27, %rd126, %rd116; | |
xor.b64 %rd28, %rd126, %rd118; | |
xor.b64 %rd29, %rd126, %rd120; | |
xor.b64 %rd30, %rd126, %rd122; | |
add.s64 %rd127, %rd114, 40; | |
xor.b64 %rd31, %rd127, %rd116; | |
xor.b64 %rd32, %rd127, %rd118; | |
xor.b64 %rd33, %rd127, %rd120; | |
xor.b64 %rd34, %rd127, %rd122; | |
add.s64 %rd128, %rd114, 48; | |
xor.b64 %rd35, %rd128, %rd116; | |
xor.b64 %rd36, %rd128, %rd118; | |
xor.b64 %rd37, %rd128, %rd120; | |
xor.b64 %rd38, %rd128, %rd122; | |
add.s64 %rd129, %rd114, 56; | |
xor.b64 %rd39, %rd129, %rd116; | |
xor.b64 %rd40, %rd129, %rd118; | |
xor.b64 %rd41, %rd129, %rd120; | |
xor.b64 %rd42, %rd129, %rd122; | |
shl.b64 %rd43, %rd93, 4; | |
shl.b64 %rd130, %rd93, 6; | |
mov.u64 %rd131, __dynamic_shared_memory__; | |
shl.b64 %rd132, %rd100, 2; | |
add.s64 %rd60, %rd131, %rd132; | |
add.s64 %rd44, %rd60, %rd130; | |
shl.b64 %rd133, %rd94, 13; | |
add.s64 %rd134, %rd88, %rd133; | |
shl.b64 %rd135, %rd97, 2; | |
add.s64 %rd136, %rd134, %rd135; | |
cp.async.cg.shared.global [%rd44], [%rd136], 16; | |
add.s64 %rd137, %rd44, 4096; | |
add.s64 %rd138, %rd136, 524288; | |
cp.async.cg.shared.global [%rd137], [%rd138], 16; | |
shl.b64 %rd45, %rd4, 8; | |
add.s64 %rd139, %rd131, 32768; | |
shl.b64 %rd140, %rd4, 4; | |
shl.b64 %rd141, %rd2, 2; | |
xor.b64 %rd142, %rd140, %rd141; | |
shl.b64 %rd143, %rd4, 10; | |
add.s64 %rd61, %rd139, %rd142; | |
add.s64 %rd144, %rd61, %rd143; | |
shl.b64 %rd145, %rd4, 12; | |
add.s64 %rd146, %rd89, %rd145; | |
shl.b64 %rd147, %rd3, 2; | |
add.s64 %rd148, %rd146, %rd147; | |
cp.async.cg.shared.global [%rd144], [%rd148], 16; | |
shl.b64 %rd46, %rd105, 8; | |
shl.b64 %rd149, %rd105, 4; | |
xor.b64 %rd150, %rd149, %rd141; | |
shl.b64 %rd151, %rd105, 10; | |
add.s64 %rd62, %rd139, %rd150; | |
add.s64 %rd152, %rd62, %rd151; | |
shl.b64 %rd153, %rd105, 12; | |
add.s64 %rd154, %rd89, %rd153; | |
add.s64 %rd155, %rd154, %rd147; | |
cp.async.cg.shared.global [%rd152], [%rd155], 16; | |
shl.b64 %rd47, %rd106, 8; | |
shl.b64 %rd156, %rd106, 4; | |
xor.b64 %rd157, %rd156, %rd141; | |
shl.b64 %rd158, %rd106, 10; | |
add.s64 %rd63, %rd139, %rd157; | |
add.s64 %rd159, %rd63, %rd158; | |
shl.b64 %rd160, %rd106, 12; | |
add.s64 %rd161, %rd89, %rd160; | |
add.s64 %rd162, %rd161, %rd147; | |
cp.async.cg.shared.global [%rd159], [%rd162], 16; | |
shl.b64 %rd48, %rd107, 8; | |
shl.b64 %rd163, %rd107, 4; | |
xor.b64 %rd164, %rd163, %rd141; | |
shl.b64 %rd165, %rd107, 10; | |
add.s64 %rd64, %rd139, %rd164; | |
add.s64 %rd166, %rd64, %rd165; | |
shl.b64 %rd167, %rd107, 12; | |
add.s64 %rd168, %rd89, %rd167; | |
add.s64 %rd169, %rd168, %rd147; | |
cp.async.cg.shared.global [%rd166], [%rd169], 16; | |
cp.async.commit_group; | |
add.s64 %rd170, %rd44, 8192; | |
add.s64 %rd171, %rd136, 64; | |
cp.async.cg.shared.global [%rd170], [%rd171], 16; | |
add.s64 %rd172, %rd44, 12288; | |
add.s64 %rd173, %rd136, 524352; | |
cp.async.cg.shared.global [%rd172], [%rd173], 16; | |
or.b64 %rd49, %rd4, 16; | |
add.s64 %rd174, %rd144, 16384; | |
shl.b64 %rd175, %rd49, 12; | |
add.s64 %rd176, %rd89, %rd175; | |
add.s64 %rd177, %rd176, %rd147; | |
cp.async.cg.shared.global [%rd174], [%rd177], 16; | |
or.b64 %rd50, %rd4, 20; | |
add.s64 %rd178, %rd152, 16384; | |
shl.b64 %rd179, %rd50, 12; | |
add.s64 %rd180, %rd89, %rd179; | |
add.s64 %rd181, %rd180, %rd147; | |
cp.async.cg.shared.global [%rd178], [%rd181], 16; | |
or.b64 %rd51, %rd4, 24; | |
add.s64 %rd182, %rd159, 16384; | |
shl.b64 %rd183, %rd51, 12; | |
add.s64 %rd184, %rd89, %rd183; | |
add.s64 %rd185, %rd184, %rd147; | |
cp.async.cg.shared.global [%rd182], [%rd185], 16; | |
or.b64 %rd52, %rd4, 28; | |
add.s64 %rd186, %rd166, 16384; | |
shl.b64 %rd187, %rd52, 12; | |
add.s64 %rd188, %rd89, %rd187; | |
add.s64 %rd189, %rd188, %rd147; | |
cp.async.cg.shared.global [%rd186], [%rd189], 16; | |
cp.async.commit_group; | |
add.s64 %rd190, %rd44, 16384; | |
add.s64 %rd191, %rd136, 128; | |
cp.async.cg.shared.global [%rd190], [%rd191], 16; | |
add.s64 %rd192, %rd44, 20480; | |
add.s64 %rd193, %rd136, 524416; | |
cp.async.cg.shared.global [%rd192], [%rd193], 16; | |
or.b64 %rd53, %rd4, 32; | |
add.s64 %rd194, %rd144, 32768; | |
shl.b64 %rd195, %rd53, 12; | |
add.s64 %rd196, %rd89, %rd195; | |
add.s64 %rd197, %rd196, %rd147; | |
cp.async.cg.shared.global [%rd194], [%rd197], 16; | |
or.b64 %rd54, %rd4, 36; | |
add.s64 %rd198, %rd152, 32768; | |
shl.b64 %rd199, %rd54, 12; | |
add.s64 %rd200, %rd89, %rd199; | |
add.s64 %rd201, %rd200, %rd147; | |
cp.async.cg.shared.global [%rd198], [%rd201], 16; | |
or.b64 %rd55, %rd4, 40; | |
add.s64 %rd202, %rd159, 32768; | |
shl.b64 %rd203, %rd55, 12; | |
add.s64 %rd204, %rd89, %rd203; | |
add.s64 %rd205, %rd204, %rd147; | |
cp.async.cg.shared.global [%rd202], [%rd205], 16; | |
or.b64 %rd56, %rd4, 44; | |
add.s64 %rd206, %rd166, 32768; | |
shl.b64 %rd207, %rd56, 12; | |
add.s64 %rd208, %rd89, %rd207; | |
add.s64 %rd209, %rd208, %rd147; | |
cp.async.cg.shared.global [%rd206], [%rd209], 16; | |
cp.async.commit_group; | |
cp.async.wait_group 2; | |
bar.sync 0; | |
mul.wide.s32 %rd210, %r61, 256; | |
shl.b64 %rd211, %rd109, 4; | |
sub.s64 %rd57, %rd211, %rd210; | |
shl.b64 %rd212, %rd57, 2; | |
add.s64 %rd213, %rd131, %rd212; | |
shl.b64 %rd214, %rd7, 2; | |
add.s64 %rd215, %rd213, %rd214; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r139, %r140, %r141, %r142}, [%rd215]; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r135, %r136, %r137, %r138}, [%rd215+1024]; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r131, %r132, %r133, %r134}, [%rd215+2048]; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r127, %r128, %r129, %r130}, [%rd215+3072]; | |
shl.b64 %rd58, %rd115, 8; | |
shl.b64 %rd216, %rd115, 10; | |
add.s64 %rd217, %rd139, %rd216; | |
shl.b64 %rd218, %rd11, 2; | |
add.s64 %rd219, %rd217, %rd218; | |
ld.shared.f32 %f708, [%rd219]; | |
shl.b64 %rd59, %rd117, 8; | |
shl.b64 %rd220, %rd117, 10; | |
add.s64 %rd221, %rd139, %rd220; | |
shl.b64 %rd222, %rd12, 2; | |
add.s64 %rd223, %rd221, %rd222; | |
ld.shared.f32 %f707, [%rd223]; | |
shl.b64 %rd224, %rd15, 2; | |
add.s64 %rd225, %rd217, %rd224; | |
ld.shared.f32 %f706, [%rd225]; | |
shl.b64 %rd226, %rd16, 2; | |
add.s64 %rd227, %rd221, %rd226; | |
ld.shared.f32 %f705, [%rd227]; | |
shl.b64 %rd228, %rd19, 2; | |
add.s64 %rd229, %rd217, %rd228; | |
ld.shared.f32 %f704, [%rd229]; | |
shl.b64 %rd230, %rd20, 2; | |
add.s64 %rd231, %rd221, %rd230; | |
ld.shared.f32 %f703, [%rd231]; | |
shl.b64 %rd232, %rd23, 2; | |
add.s64 %rd233, %rd217, %rd232; | |
ld.shared.f32 %f702, [%rd233]; | |
shl.b64 %rd234, %rd24, 2; | |
add.s64 %rd235, %rd221, %rd234; | |
ld.shared.f32 %f701, [%rd235]; | |
shl.b64 %rd236, %rd27, 2; | |
add.s64 %rd237, %rd217, %rd236; | |
ld.shared.f32 %f700, [%rd237]; | |
shl.b64 %rd238, %rd28, 2; | |
add.s64 %rd239, %rd221, %rd238; | |
ld.shared.f32 %f699, [%rd239]; | |
shl.b64 %rd240, %rd31, 2; | |
add.s64 %rd241, %rd217, %rd240; | |
ld.shared.f32 %f698, [%rd241]; | |
shl.b64 %rd242, %rd32, 2; | |
add.s64 %rd243, %rd221, %rd242; | |
ld.shared.f32 %f697, [%rd243]; | |
shl.b64 %rd244, %rd35, 2; | |
add.s64 %rd245, %rd217, %rd244; | |
ld.shared.f32 %f696, [%rd245]; | |
shl.b64 %rd246, %rd36, 2; | |
add.s64 %rd247, %rd221, %rd246; | |
ld.shared.f32 %f695, [%rd247]; | |
shl.b64 %rd248, %rd39, 2; | |
add.s64 %rd249, %rd217, %rd248; | |
ld.shared.f32 %f694, [%rd249]; | |
shl.b64 %rd250, %rd40, 2; | |
add.s64 %rd251, %rd221, %rd250; | |
ld.shared.f32 %f693, [%rd251]; | |
shl.b64 %rd65, %rd119, 8; | |
shl.b64 %rd66, %rd121, 8; | |
mul.wide.u32 %rd67, %r49, 16; | |
mul.wide.u32 %rd252, %r51, 1048576; | |
mul.wide.u32 %rd253, %r50, 262144; | |
add.s64 %rd254, %rd252, %rd253; | |
mul.lo.s64 %rd255, %rd92, 8128; | |
add.s64 %rd256, %rd254, %rd255; | |
add.s64 %rd434, %rd88, %rd256; | |
mul.wide.u32 %rd257, %r50, 8192; | |
mul.lo.s64 %rd258, %rd102, 3072; | |
add.s64 %rd259, %rd257, %rd258; | |
mul.wide.u32 %rd260, %r52, 1024; | |
add.s64 %rd261, %rd259, %rd260; | |
add.s64 %rd432, %rd89, %rd261; | |
mov.f32 %f709, 0f00000000; | |
mov.u64 %rd437, 0; | |
mov.u64 %rd436, 1; | |
mov.u64 %rd435, 2; | |
mov.u64 %rd433, -16; | |
mov.u64 %rd431, 3; | |
shl.b64 %rd273, %rd8, 2; | |
shl.b64 %rd280, %rd13, 2; | |
shl.b64 %rd285, %rd14, 2; | |
shl.b64 %rd287, %rd17, 2; | |
shl.b64 %rd289, %rd18, 2; | |
shl.b64 %rd291, %rd21, 2; | |
shl.b64 %rd293, %rd22, 2; | |
shl.b64 %rd295, %rd25, 2; | |
shl.b64 %rd297, %rd26, 2; | |
shl.b64 %rd299, %rd29, 2; | |
shl.b64 %rd301, %rd30, 2; | |
shl.b64 %rd303, %rd33, 2; | |
shl.b64 %rd305, %rd34, 2; | |
shl.b64 %rd307, %rd37, 2; | |
shl.b64 %rd309, %rd38, 2; | |
shl.b64 %rd311, %rd41, 2; | |
shl.b64 %rd313, %rd42, 2; | |
mov.f32 %f710, %f709; | |
mov.f32 %f711, %f709; | |
mov.f32 %f712, %f709; | |
mov.f32 %f713, %f709; | |
mov.f32 %f714, %f709; | |
mov.f32 %f715, %f709; | |
mov.f32 %f716, %f709; | |
mov.f32 %f717, %f709; | |
mov.f32 %f718, %f709; | |
mov.f32 %f719, %f709; | |
mov.f32 %f720, %f709; | |
mov.f32 %f721, %f709; | |
mov.f32 %f722, %f709; | |
mov.f32 %f723, %f709; | |
mov.f32 %f724, %f709; | |
mov.f32 %f725, %f709; | |
mov.f32 %f726, %f709; | |
mov.f32 %f727, %f709; | |
mov.f32 %f728, %f709; | |
mov.f32 %f729, %f709; | |
mov.f32 %f730, %f709; | |
mov.f32 %f731, %f709; | |
mov.f32 %f732, %f709; | |
mov.f32 %f733, %f709; | |
mov.f32 %f734, %f709; | |
mov.f32 %f735, %f709; | |
mov.f32 %f736, %f709; | |
mov.f32 %f737, %f709; | |
mov.f32 %f738, %f709; | |
mov.f32 %f739, %f709; | |
mov.f32 %f740, %f709; | |
mov.f32 %f741, %f709; | |
mov.f32 %f742, %f709; | |
mov.f32 %f743, %f709; | |
mov.f32 %f744, %f709; | |
mov.f32 %f745, %f709; | |
mov.f32 %f746, %f709; | |
mov.f32 %f747, %f709; | |
mov.f32 %f748, %f709; | |
mov.f32 %f749, %f709; | |
mov.f32 %f750, %f709; | |
mov.f32 %f751, %f709; | |
mov.f32 %f752, %f709; | |
mov.f32 %f753, %f709; | |
mov.f32 %f754, %f709; | |
mov.f32 %f755, %f709; | |
mov.f32 %f756, %f709; | |
mov.f32 %f757, %f709; | |
mov.f32 %f758, %f709; | |
mov.f32 %f759, %f709; | |
mov.f32 %f760, %f709; | |
mov.f32 %f761, %f709; | |
mov.f32 %f762, %f709; | |
mov.f32 %f763, %f709; | |
mov.f32 %f764, %f709; | |
mov.f32 %f765, %f709; | |
mov.f32 %f766, %f709; | |
mov.f32 %f767, %f709; | |
mov.f32 %f768, %f709; | |
mov.f32 %f769, %f709; | |
mov.f32 %f770, %f709; | |
mov.f32 %f771, %f709; | |
mov.f32 %f772, %f709; | |
mov.f32 %f773, %f709; | |
mov.f32 %f774, %f709; | |
mov.f32 %f775, %f709; | |
mov.f32 %f776, %f709; | |
mov.f32 %f777, %f709; | |
mov.f32 %f778, %f709; | |
mov.f32 %f779, %f709; | |
mov.f32 %f780, %f709; | |
mov.f32 %f781, %f709; | |
mov.f32 %f782, %f709; | |
mov.f32 %f783, %f709; | |
mov.f32 %f784, %f709; | |
mov.f32 %f785, %f709; | |
mov.f32 %f786, %f709; | |
mov.f32 %f787, %f709; | |
mov.f32 %f788, %f709; | |
mov.f32 %f789, %f709; | |
mov.f32 %f790, %f709; | |
mov.f32 %f791, %f709; | |
mov.f32 %f792, %f709; | |
mov.f32 %f793, %f709; | |
mov.f32 %f794, %f709; | |
mov.f32 %f795, %f709; | |
mov.f32 %f796, %f709; | |
mov.f32 %f797, %f709; | |
mov.f32 %f798, %f709; | |
mov.f32 %f799, %f709; | |
mov.f32 %f800, %f709; | |
mov.f32 %f801, %f709; | |
mov.f32 %f802, %f709; | |
mov.f32 %f803, %f709; | |
mov.f32 %f804, %f709; | |
mov.f32 %f805, %f709; | |
mov.f32 %f806, %f709; | |
mov.f32 %f807, %f709; | |
mov.f32 %f808, %f709; | |
mov.f32 %f809, %f709; | |
mov.f32 %f810, %f709; | |
mov.f32 %f811, %f709; | |
mov.f32 %f812, %f709; | |
mov.f32 %f813, %f709; | |
mov.f32 %f814, %f709; | |
mov.f32 %f815, %f709; | |
mov.f32 %f816, %f709; | |
mov.f32 %f817, %f709; | |
mov.f32 %f818, %f709; | |
mov.f32 %f819, %f709; | |
mov.f32 %f820, %f709; | |
mov.f32 %f821, %f709; | |
mov.f32 %f822, %f709; | |
mov.f32 %f823, %f709; | |
mov.f32 %f824, %f709; | |
mov.f32 %f825, %f709; | |
mov.f32 %f826, %f709; | |
mov.f32 %f827, %f709; | |
mov.f32 %f828, %f709; | |
mov.f32 %f829, %f709; | |
mov.f32 %f830, %f709; | |
mov.f32 %f831, %f709; | |
mov.f32 %f832, %f709; | |
mov.f32 %f833, %f709; | |
mov.f32 %f834, %f709; | |
mov.f32 %f835, %f709; | |
mov.f32 %f836, %f709; | |
$L__BB0_1: | |
mov.u64 %rd75, %rd436; | |
mov.u64 %rd436, %rd435; | |
add.s64 %rd433, %rd433, 16; | |
setp.lt.u64 %p1, %rd433, 2000; | |
shl.b64 %rd268, %rd437, 13; | |
add.s64 %rd270, %rd131, %rd268; | |
add.s64 %rd272, %rd270, %rd212; | |
add.s64 %rd274, %rd272, %rd273; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r78, %r79, %r80, %r81}, [%rd274]; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r82, %r83, %r84, %r85}, [%rd274+1024]; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r86, %r87, %r88, %r89}, [%rd274+2048]; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r90, %r91, %r92, %r93}, [%rd274+3072]; | |
shl.b64 %rd275, %rd437, 12; | |
or.b64 %rd276, %rd275, %rd65; | |
shl.b64 %rd277, %rd276, 2; | |
add.s64 %rd279, %rd139, %rd277; | |
add.s64 %rd281, %rd279, %rd280; | |
ld.shared.u32 %r94, [%rd281]; | |
or.b64 %rd282, %rd275, %rd66; | |
shl.b64 %rd283, %rd282, 2; | |
add.s64 %rd284, %rd139, %rd283; | |
add.s64 %rd286, %rd284, %rd285; | |
ld.shared.u32 %r95, [%rd286]; | |
add.s64 %rd288, %rd279, %rd287; | |
ld.shared.u32 %r96, [%rd288]; | |
add.s64 %rd290, %rd284, %rd289; | |
ld.shared.u32 %r97, [%rd290]; | |
add.s64 %rd292, %rd279, %rd291; | |
ld.shared.u32 %r98, [%rd292]; | |
add.s64 %rd294, %rd284, %rd293; | |
ld.shared.u32 %r99, [%rd294]; | |
add.s64 %rd296, %rd279, %rd295; | |
ld.shared.u32 %r100, [%rd296]; | |
add.s64 %rd298, %rd284, %rd297; | |
ld.shared.u32 %r101, [%rd298]; | |
add.s64 %rd300, %rd279, %rd299; | |
ld.shared.u32 %r102, [%rd300]; | |
add.s64 %rd302, %rd284, %rd301; | |
ld.shared.u32 %r103, [%rd302]; | |
add.s64 %rd304, %rd279, %rd303; | |
ld.shared.u32 %r104, [%rd304]; | |
add.s64 %rd306, %rd284, %rd305; | |
ld.shared.u32 %r105, [%rd306]; | |
add.s64 %rd308, %rd279, %rd307; | |
ld.shared.u32 %r106, [%rd308]; | |
add.s64 %rd310, %rd284, %rd309; | |
ld.shared.u32 %r107, [%rd310]; | |
add.s64 %rd312, %rd279, %rd311; | |
ld.shared.u32 %r108, [%rd312]; | |
add.s64 %rd314, %rd284, %rd313; | |
ld.shared.u32 %r109, [%rd314]; | |
mov.b32 %r110, %f708; | |
mov.b32 %r111, %f707; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f437, %f438, %f439, %f440}, | |
{%r139, %r140, %r141, %r142}, | |
{%r110, %r111}, | |
{%f833, %f834, %f835, %f836}; | |
mov.b32 %r112, %f706; | |
mov.b32 %r113, %f705; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f441, %f442, %f443, %f444}, | |
{%r139, %r140, %r141, %r142}, | |
{%r112, %r113}, | |
{%f829, %f830, %f831, %f832}; | |
mov.b32 %r114, %f704; | |
mov.b32 %r115, %f703; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f445, %f446, %f447, %f448}, | |
{%r139, %r140, %r141, %r142}, | |
{%r114, %r115}, | |
{%f825, %f826, %f827, %f828}; | |
mov.b32 %r116, %f702; | |
mov.b32 %r117, %f701; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f449, %f450, %f451, %f452}, | |
{%r139, %r140, %r141, %r142}, | |
{%r116, %r117}, | |
{%f821, %f822, %f823, %f824}; | |
mov.b32 %r118, %f700; | |
mov.b32 %r119, %f699; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f453, %f454, %f455, %f456}, | |
{%r139, %r140, %r141, %r142}, | |
{%r118, %r119}, | |
{%f817, %f818, %f819, %f820}; | |
mov.b32 %r120, %f698; | |
mov.b32 %r121, %f697; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f457, %f458, %f459, %f460}, | |
{%r139, %r140, %r141, %r142}, | |
{%r120, %r121}, | |
{%f813, %f814, %f815, %f816}; | |
mov.b32 %r122, %f696; | |
mov.b32 %r123, %f695; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f461, %f462, %f463, %f464}, | |
{%r139, %r140, %r141, %r142}, | |
{%r122, %r123}, | |
{%f809, %f810, %f811, %f812}; | |
mov.b32 %r124, %f694; | |
mov.b32 %r125, %f693; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f465, %f466, %f467, %f468}, | |
{%r139, %r140, %r141, %r142}, | |
{%r124, %r125}, | |
{%f805, %f806, %f807, %f808}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f469, %f470, %f471, %f472}, | |
{%r135, %r136, %r137, %r138}, | |
{%r110, %r111}, | |
{%f801, %f802, %f803, %f804}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f473, %f474, %f475, %f476}, | |
{%r135, %r136, %r137, %r138}, | |
{%r112, %r113}, | |
{%f797, %f798, %f799, %f800}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f477, %f478, %f479, %f480}, | |
{%r135, %r136, %r137, %r138}, | |
{%r114, %r115}, | |
{%f793, %f794, %f795, %f796}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f481, %f482, %f483, %f484}, | |
{%r135, %r136, %r137, %r138}, | |
{%r116, %r117}, | |
{%f789, %f790, %f791, %f792}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f485, %f486, %f487, %f488}, | |
{%r135, %r136, %r137, %r138}, | |
{%r118, %r119}, | |
{%f785, %f786, %f787, %f788}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f489, %f490, %f491, %f492}, | |
{%r135, %r136, %r137, %r138}, | |
{%r120, %r121}, | |
{%f781, %f782, %f783, %f784}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f493, %f494, %f495, %f496}, | |
{%r135, %r136, %r137, %r138}, | |
{%r122, %r123}, | |
{%f777, %f778, %f779, %f780}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f497, %f498, %f499, %f500}, | |
{%r135, %r136, %r137, %r138}, | |
{%r124, %r125}, | |
{%f773, %f774, %f775, %f776}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f501, %f502, %f503, %f504}, | |
{%r131, %r132, %r133, %r134}, | |
{%r110, %r111}, | |
{%f769, %f770, %f771, %f772}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f505, %f506, %f507, %f508}, | |
{%r131, %r132, %r133, %r134}, | |
{%r112, %r113}, | |
{%f765, %f766, %f767, %f768}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f509, %f510, %f511, %f512}, | |
{%r131, %r132, %r133, %r134}, | |
{%r114, %r115}, | |
{%f761, %f762, %f763, %f764}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f513, %f514, %f515, %f516}, | |
{%r131, %r132, %r133, %r134}, | |
{%r116, %r117}, | |
{%f757, %f758, %f759, %f760}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f517, %f518, %f519, %f520}, | |
{%r131, %r132, %r133, %r134}, | |
{%r118, %r119}, | |
{%f753, %f754, %f755, %f756}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f521, %f522, %f523, %f524}, | |
{%r131, %r132, %r133, %r134}, | |
{%r120, %r121}, | |
{%f749, %f750, %f751, %f752}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f525, %f526, %f527, %f528}, | |
{%r131, %r132, %r133, %r134}, | |
{%r122, %r123}, | |
{%f745, %f746, %f747, %f748}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f529, %f530, %f531, %f532}, | |
{%r131, %r132, %r133, %r134}, | |
{%r124, %r125}, | |
{%f741, %f742, %f743, %f744}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f533, %f534, %f535, %f536}, | |
{%r127, %r128, %r129, %r130}, | |
{%r110, %r111}, | |
{%f737, %f738, %f739, %f740}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f537, %f538, %f539, %f540}, | |
{%r127, %r128, %r129, %r130}, | |
{%r112, %r113}, | |
{%f733, %f734, %f735, %f736}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f541, %f542, %f543, %f544}, | |
{%r127, %r128, %r129, %r130}, | |
{%r114, %r115}, | |
{%f729, %f730, %f731, %f732}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f545, %f546, %f547, %f548}, | |
{%r127, %r128, %r129, %r130}, | |
{%r116, %r117}, | |
{%f725, %f726, %f727, %f728}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f549, %f550, %f551, %f552}, | |
{%r127, %r128, %r129, %r130}, | |
{%r118, %r119}, | |
{%f721, %f722, %f723, %f724}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f553, %f554, %f555, %f556}, | |
{%r127, %r128, %r129, %r130}, | |
{%r120, %r121}, | |
{%f717, %f718, %f719, %f720}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f557, %f558, %f559, %f560}, | |
{%r127, %r128, %r129, %r130}, | |
{%r122, %r123}, | |
{%f713, %f714, %f715, %f716}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f561, %f562, %f563, %f564}, | |
{%r127, %r128, %r129, %r130}, | |
{%r124, %r125}, | |
{%f709, %f710, %f711, %f712}; | |
and.b64 %rd435, %rd431, 3; | |
shl.b64 %rd315, %rd435, 11; | |
or.b64 %rd316, %rd315, %rd43; | |
shl.b64 %rd317, %rd316, 2; | |
add.s64 %rd318, %rd60, %rd317; | |
add.s64 %rd319, %rd434, %rd67; | |
add.s64 %rd262, %rd319, 192; | |
selp.b32 %r67, 16, 0, %p1; | |
cvt.u32.u64 %r66, %rd318; | |
// begin inline asm | |
cp.async.cg.shared.global [%r66], [%rd262], 16, %r67; | |
// end inline asm | |
shl.b64 %rd320, %rd435, 13; | |
add.s64 %rd321, %rd44, %rd320; | |
add.s64 %rd263, %rd319, 524480; | |
cvt.u32.u64 %r126, %rd321; | |
add.s32 %r68, %r126, 4096; | |
// begin inline asm | |
cp.async.cg.shared.global [%r68], [%rd263], 16, %r67; | |
// end inline asm | |
shl.b64 %rd322, %rd435, 12; | |
or.b64 %rd323, %rd322, %rd45; | |
shl.b64 %rd324, %rd323, 2; | |
add.s64 %rd325, %rd61, %rd324; | |
add.s64 %rd326, %rd432, %rd67; | |
add.s64 %rd264, %rd326, 196608; | |
cvt.u32.u64 %r70, %rd325; | |
// begin inline asm | |
cp.async.cg.shared.global [%r70], [%rd264], 16, %r67; | |
// end inline asm | |
or.b64 %rd327, %rd322, %rd46; | |
shl.b64 %rd328, %rd327, 2; | |
add.s64 %rd329, %rd62, %rd328; | |
add.s64 %rd265, %rd326, 212992; | |
cvt.u32.u64 %r72, %rd329; | |
// begin inline asm | |
cp.async.cg.shared.global [%r72], [%rd265], 16, %r67; | |
// end inline asm | |
or.b64 %rd330, %rd322, %rd47; | |
shl.b64 %rd331, %rd330, 2; | |
add.s64 %rd332, %rd63, %rd331; | |
add.s64 %rd266, %rd326, 229376; | |
cvt.u32.u64 %r74, %rd332; | |
// begin inline asm | |
cp.async.cg.shared.global [%r74], [%rd266], 16, %r67; | |
// end inline asm | |
or.b64 %rd333, %rd322, %rd48; | |
shl.b64 %rd334, %rd333, 2; | |
add.s64 %rd335, %rd64, %rd334; | |
add.s64 %rd267, %rd326, 245760; | |
cvt.u32.u64 %r76, %rd335; | |
// begin inline asm | |
cp.async.cg.shared.global [%r76], [%rd267], 16, %r67; | |
// end inline asm | |
cp.async.commit_group; | |
cp.async.wait_group 2; | |
bar.sync 0; | |
shl.b64 %rd336, %rd75, 13; | |
add.s64 %rd337, %rd131, %rd336; | |
add.s64 %rd338, %rd337, %rd212; | |
add.s64 %rd340, %rd338, %rd214; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r139, %r140, %r141, %r142}, [%rd340]; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r135, %r136, %r137, %r138}, [%rd340+1024]; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r131, %r132, %r133, %r134}, [%rd340+2048]; | |
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r127, %r128, %r129, %r130}, [%rd340+3072]; | |
shl.b64 %rd341, %rd75, 12; | |
or.b64 %rd342, %rd341, %rd58; | |
shl.b64 %rd343, %rd342, 2; | |
add.s64 %rd344, %rd139, %rd343; | |
add.s64 %rd346, %rd344, %rd218; | |
ld.shared.f32 %f708, [%rd346]; | |
or.b64 %rd347, %rd341, %rd59; | |
shl.b64 %rd348, %rd347, 2; | |
add.s64 %rd349, %rd139, %rd348; | |
add.s64 %rd351, %rd349, %rd222; | |
ld.shared.f32 %f707, [%rd351]; | |
add.s64 %rd353, %rd344, %rd224; | |
ld.shared.f32 %f706, [%rd353]; | |
add.s64 %rd355, %rd349, %rd226; | |
ld.shared.f32 %f705, [%rd355]; | |
add.s64 %rd357, %rd344, %rd228; | |
ld.shared.f32 %f704, [%rd357]; | |
add.s64 %rd359, %rd349, %rd230; | |
ld.shared.f32 %f703, [%rd359]; | |
add.s64 %rd361, %rd344, %rd232; | |
ld.shared.f32 %f702, [%rd361]; | |
add.s64 %rd363, %rd349, %rd234; | |
ld.shared.f32 %f701, [%rd363]; | |
add.s64 %rd365, %rd344, %rd236; | |
ld.shared.f32 %f700, [%rd365]; | |
add.s64 %rd367, %rd349, %rd238; | |
ld.shared.f32 %f699, [%rd367]; | |
add.s64 %rd369, %rd344, %rd240; | |
ld.shared.f32 %f698, [%rd369]; | |
add.s64 %rd371, %rd349, %rd242; | |
ld.shared.f32 %f697, [%rd371]; | |
add.s64 %rd373, %rd344, %rd244; | |
ld.shared.f32 %f696, [%rd373]; | |
add.s64 %rd375, %rd349, %rd246; | |
ld.shared.f32 %f695, [%rd375]; | |
add.s64 %rd377, %rd344, %rd248; | |
ld.shared.f32 %f694, [%rd377]; | |
add.s64 %rd379, %rd349, %rd250; | |
ld.shared.f32 %f693, [%rd379]; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f833, %f834, %f835, %f836}, | |
{%r78, %r79, %r80, %r81}, | |
{%r94, %r95}, | |
{%f437, %f438, %f439, %f440}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f829, %f830, %f831, %f832}, | |
{%r78, %r79, %r80, %r81}, | |
{%r96, %r97}, | |
{%f441, %f442, %f443, %f444}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f825, %f826, %f827, %f828}, | |
{%r78, %r79, %r80, %r81}, | |
{%r98, %r99}, | |
{%f445, %f446, %f447, %f448}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f821, %f822, %f823, %f824}, | |
{%r78, %r79, %r80, %r81}, | |
{%r100, %r101}, | |
{%f449, %f450, %f451, %f452}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f817, %f818, %f819, %f820}, | |
{%r78, %r79, %r80, %r81}, | |
{%r102, %r103}, | |
{%f453, %f454, %f455, %f456}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f813, %f814, %f815, %f816}, | |
{%r78, %r79, %r80, %r81}, | |
{%r104, %r105}, | |
{%f457, %f458, %f459, %f460}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f809, %f810, %f811, %f812}, | |
{%r78, %r79, %r80, %r81}, | |
{%r106, %r107}, | |
{%f461, %f462, %f463, %f464}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f805, %f806, %f807, %f808}, | |
{%r78, %r79, %r80, %r81}, | |
{%r108, %r109}, | |
{%f465, %f466, %f467, %f468}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f801, %f802, %f803, %f804}, | |
{%r82, %r83, %r84, %r85}, | |
{%r94, %r95}, | |
{%f469, %f470, %f471, %f472}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f797, %f798, %f799, %f800}, | |
{%r82, %r83, %r84, %r85}, | |
{%r96, %r97}, | |
{%f473, %f474, %f475, %f476}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f793, %f794, %f795, %f796}, | |
{%r82, %r83, %r84, %r85}, | |
{%r98, %r99}, | |
{%f477, %f478, %f479, %f480}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f789, %f790, %f791, %f792}, | |
{%r82, %r83, %r84, %r85}, | |
{%r100, %r101}, | |
{%f481, %f482, %f483, %f484}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f785, %f786, %f787, %f788}, | |
{%r82, %r83, %r84, %r85}, | |
{%r102, %r103}, | |
{%f485, %f486, %f487, %f488}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f781, %f782, %f783, %f784}, | |
{%r82, %r83, %r84, %r85}, | |
{%r104, %r105}, | |
{%f489, %f490, %f491, %f492}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f777, %f778, %f779, %f780}, | |
{%r82, %r83, %r84, %r85}, | |
{%r106, %r107}, | |
{%f493, %f494, %f495, %f496}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f773, %f774, %f775, %f776}, | |
{%r82, %r83, %r84, %r85}, | |
{%r108, %r109}, | |
{%f497, %f498, %f499, %f500}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f769, %f770, %f771, %f772}, | |
{%r86, %r87, %r88, %r89}, | |
{%r94, %r95}, | |
{%f501, %f502, %f503, %f504}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f765, %f766, %f767, %f768}, | |
{%r86, %r87, %r88, %r89}, | |
{%r96, %r97}, | |
{%f505, %f506, %f507, %f508}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f761, %f762, %f763, %f764}, | |
{%r86, %r87, %r88, %r89}, | |
{%r98, %r99}, | |
{%f509, %f510, %f511, %f512}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f757, %f758, %f759, %f760}, | |
{%r86, %r87, %r88, %r89}, | |
{%r100, %r101}, | |
{%f513, %f514, %f515, %f516}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f753, %f754, %f755, %f756}, | |
{%r86, %r87, %r88, %r89}, | |
{%r102, %r103}, | |
{%f517, %f518, %f519, %f520}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f749, %f750, %f751, %f752}, | |
{%r86, %r87, %r88, %r89}, | |
{%r104, %r105}, | |
{%f521, %f522, %f523, %f524}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f745, %f746, %f747, %f748}, | |
{%r86, %r87, %r88, %r89}, | |
{%r106, %r107}, | |
{%f525, %f526, %f527, %f528}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f741, %f742, %f743, %f744}, | |
{%r86, %r87, %r88, %r89}, | |
{%r108, %r109}, | |
{%f529, %f530, %f531, %f532}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f737, %f738, %f739, %f740}, | |
{%r90, %r91, %r92, %r93}, | |
{%r94, %r95}, | |
{%f533, %f534, %f535, %f536}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f733, %f734, %f735, %f736}, | |
{%r90, %r91, %r92, %r93}, | |
{%r96, %r97}, | |
{%f537, %f538, %f539, %f540}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f729, %f730, %f731, %f732}, | |
{%r90, %r91, %r92, %r93}, | |
{%r98, %r99}, | |
{%f541, %f542, %f543, %f544}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f725, %f726, %f727, %f728}, | |
{%r90, %r91, %r92, %r93}, | |
{%r100, %r101}, | |
{%f545, %f546, %f547, %f548}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f721, %f722, %f723, %f724}, | |
{%r90, %r91, %r92, %r93}, | |
{%r102, %r103}, | |
{%f549, %f550, %f551, %f552}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f717, %f718, %f719, %f720}, | |
{%r90, %r91, %r92, %r93}, | |
{%r104, %r105}, | |
{%f553, %f554, %f555, %f556}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f713, %f714, %f715, %f716}, | |
{%r90, %r91, %r92, %r93}, | |
{%r106, %r107}, | |
{%f557, %f558, %f559, %f560}; | |
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 | |
{%f709, %f710, %f711, %f712}, | |
{%r90, %r91, %r92, %r93}, | |
{%r108, %r109}, | |
{%f561, %f562, %f563, %f564}; | |
add.s64 %rd434, %rd434, 64; | |
add.s64 %rd432, %rd432, 65536; | |
add.s64 %rd431, %rd431, 1; | |
setp.lt.u64 %p2, %rd433, 2032; | |
mov.u64 %rd437, %rd75; | |
@%p2 bra $L__BB0_1; | |
add.s64 %rd380, %rd5, %rd9; | |
shl.b64 %rd381, %rd9, 3; | |
sub.s64 %rd382, %rd6, %rd381; | |
add.s64 %rd383, %rd382, %rd10; | |
cp.async.commit_group; | |
cp.async.wait_group 0; | |
bar.sync 0; | |
shl.b64 %rd384, %rd380, 10; | |
add.s64 %rd386, %rd131, %rd384; | |
shl.b64 %rd387, %rd383, 2; | |
add.s64 %rd388, %rd386, %rd387; | |
st.shared.v2.f32 [%rd388+49376], {%f709, %f710}; | |
st.shared.v2.f32 [%rd388+57568], {%f711, %f712}; | |
st.shared.v2.f32 [%rd388+49344], {%f713, %f714}; | |
st.shared.v2.f32 [%rd388+57536], {%f715, %f716}; | |
st.shared.v2.f32 [%rd388+49312], {%f717, %f718}; | |
st.shared.v2.f32 [%rd388+57504], {%f719, %f720}; | |
st.shared.v2.f32 [%rd388+49280], {%f721, %f722}; | |
st.shared.v2.f32 [%rd388+57472], {%f723, %f724}; | |
st.shared.v2.f32 [%rd388+49248], {%f725, %f726}; | |
st.shared.v2.f32 [%rd388+57440], {%f727, %f728}; | |
st.shared.v2.f32 [%rd388+49216], {%f729, %f730}; | |
st.shared.v2.f32 [%rd388+57408], {%f731, %f732}; | |
st.shared.v2.f32 [%rd388+49184], {%f733, %f734}; | |
st.shared.v2.f32 [%rd388+57376], {%f735, %f736}; | |
st.shared.v2.f32 [%rd388+49152], {%f737, %f738}; | |
st.shared.v2.f32 [%rd388+57344], {%f739, %f740}; | |
st.shared.v2.f32 [%rd388+32992], {%f741, %f742}; | |
st.shared.v2.f32 [%rd388+41184], {%f743, %f744}; | |
st.shared.v2.f32 [%rd388+32960], {%f745, %f746}; | |
st.shared.v2.f32 [%rd388+41152], {%f747, %f748}; | |
st.shared.v2.f32 [%rd388+32928], {%f749, %f750}; | |
st.shared.v2.f32 [%rd388+41120], {%f751, %f752}; | |
st.shared.v2.f32 [%rd388+32896], {%f753, %f754}; | |
st.shared.v2.f32 [%rd388+41088], {%f755, %f756}; | |
st.shared.v2.f32 [%rd388+32864], {%f757, %f758}; | |
st.shared.v2.f32 [%rd388+41056], {%f759, %f760}; | |
st.shared.v2.f32 [%rd388+32832], {%f761, %f762}; | |
st.shared.v2.f32 [%rd388+41024], {%f763, %f764}; | |
st.shared.v2.f32 [%rd388+32800], {%f765, %f766}; | |
st.shared.v2.f32 [%rd388+40992], {%f767, %f768}; | |
st.shared.v2.f32 [%rd388+32768], {%f769, %f770}; | |
st.shared.v2.f32 [%rd388+40960], {%f771, %f772}; | |
st.shared.v2.f32 [%rd388+16608], {%f773, %f774}; | |
st.shared.v2.f32 [%rd388+24800], {%f775, %f776}; | |
st.shared.v2.f32 [%rd388+16576], {%f777, %f778}; | |
st.shared.v2.f32 [%rd388+24768], {%f779, %f780}; | |
st.shared.v2.f32 [%rd388+16544], {%f781, %f782}; | |
st.shared.v2.f32 [%rd388+24736], {%f783, %f784}; | |
st.shared.v2.f32 [%rd388+16512], {%f785, %f786}; | |
st.shared.v2.f32 [%rd388+24704], {%f787, %f788}; | |
st.shared.v2.f32 [%rd388+16480], {%f789, %f790}; | |
st.shared.v2.f32 [%rd388+24672], {%f791, %f792}; | |
st.shared.v2.f32 [%rd388+16448], {%f793, %f794}; | |
st.shared.v2.f32 [%rd388+24640], {%f795, %f796}; | |
st.shared.v2.f32 [%rd388+16416], {%f797, %f798}; | |
st.shared.v2.f32 [%rd388+24608], {%f799, %f800}; | |
st.shared.v2.f32 [%rd388+16384], {%f801, %f802}; | |
st.shared.v2.f32 [%rd388+24576], {%f803, %f804}; | |
st.shared.v2.f32 [%rd388+224], {%f805, %f806}; | |
st.shared.v2.f32 [%rd388+8416], {%f807, %f808}; | |
st.shared.v2.f32 [%rd388+192], {%f809, %f810}; | |
st.shared.v2.f32 [%rd388+8384], {%f811, %f812}; | |
st.shared.v2.f32 [%rd388+160], {%f813, %f814}; | |
st.shared.v2.f32 [%rd388+8352], {%f815, %f816}; | |
st.shared.v2.f32 [%rd388+128], {%f817, %f818}; | |
st.shared.v2.f32 [%rd388+8320], {%f819, %f820}; | |
st.shared.v2.f32 [%rd388+96], {%f821, %f822}; | |
st.shared.v2.f32 [%rd388+8288], {%f823, %f824}; | |
st.shared.v2.f32 [%rd388+64], {%f825, %f826}; | |
st.shared.v2.f32 [%rd388+8256], {%f827, %f828}; | |
st.shared.v2.f32 [%rd388+32], {%f829, %f830}; | |
st.shared.v2.f32 [%rd388+8224], {%f831, %f832}; | |
st.shared.v2.f32 [%rd388], {%f833, %f834}; | |
st.shared.v2.f32 [%rd388+8192], {%f835, %f836}; | |
bar.sync 0; | |
shl.b64 %rd389, %rd45, 2; | |
add.s64 %rd390, %rd131, %rd389; | |
add.s64 %rd392, %rd390, %rd141; | |
ld.shared.v4.f32 {%f565, %f566, %f567, %f568}, [%rd392]; | |
or.b64 %rd393, %rd4, %rd1; | |
shl.b64 %rd394, %rd393, 12; | |
add.s64 %rd395, %rd82, %rd394; | |
add.s64 %rd397, %rd395, %rd147; | |
st.global.v4.f32 [%rd397], {%f565, %f566, %f567, %f568}; | |
shl.b64 %rd398, %rd46, 2; | |
add.s64 %rd399, %rd131, %rd398; | |
add.s64 %rd400, %rd399, %rd141; | |
ld.shared.v4.f32 {%f569, %f570, %f571, %f572}, [%rd400]; | |
st.global.v4.f32 [%rd397+16384], {%f569, %f570, %f571, %f572}; | |
shl.b64 %rd401, %rd47, 2; | |
add.s64 %rd402, %rd131, %rd401; | |
add.s64 %rd403, %rd402, %rd141; | |
ld.shared.v4.f32 {%f573, %f574, %f575, %f576}, [%rd403]; | |
st.global.v4.f32 [%rd397+32768], {%f573, %f574, %f575, %f576}; | |
shl.b64 %rd404, %rd48, 2; | |
add.s64 %rd405, %rd131, %rd404; | |
add.s64 %rd406, %rd405, %rd141; | |
ld.shared.v4.f32 {%f577, %f578, %f579, %f580}, [%rd406]; | |
st.global.v4.f32 [%rd397+49152], {%f577, %f578, %f579, %f580}; | |
shl.b64 %rd407, %rd49, 10; | |
add.s64 %rd408, %rd131, %rd407; | |
add.s64 %rd409, %rd408, %rd141; | |
ld.shared.v4.f32 {%f581, %f582, %f583, %f584}, [%rd409]; | |
st.global.v4.f32 [%rd397+65536], {%f581, %f582, %f583, %f584}; | |
shl.b64 %rd410, %rd50, 10; | |
add.s64 %rd411, %rd131, %rd410; | |
add.s64 %rd412, %rd411, %rd141; | |
ld.shared.v4.f32 {%f585, %f586, %f587, %f588}, [%rd412]; | |
st.global.v4.f32 [%rd397+81920], {%f585, %f586, %f587, %f588}; | |
shl.b64 %rd413, %rd51, 10; | |
add.s64 %rd414, %rd131, %rd413; | |
add.s64 %rd415, %rd414, %rd141; | |
ld.shared.v4.f32 {%f589, %f590, %f591, %f592}, [%rd415]; | |
st.global.v4.f32 [%rd397+98304], {%f589, %f590, %f591, %f592}; | |
shl.b64 %rd416, %rd52, 10; | |
add.s64 %rd417, %rd131, %rd416; | |
add.s64 %rd418, %rd417, %rd141; | |
ld.shared.v4.f32 {%f593, %f594, %f595, %f596}, [%rd418]; | |
st.global.v4.f32 [%rd397+114688], {%f593, %f594, %f595, %f596}; | |
shl.b64 %rd419, %rd53, 10; | |
add.s64 %rd420, %rd131, %rd419; | |
add.s64 %rd421, %rd420, %rd141; | |
ld.shared.v4.f32 {%f597, %f598, %f599, %f600}, [%rd421]; | |
st.global.v4.f32 [%rd397+131072], {%f597, %f598, %f599, %f600}; | |
shl.b64 %rd422, %rd54, 10; | |
add.s64 %rd423, %rd131, %rd422; | |
add.s64 %rd424, %rd423, %rd141; | |
ld.shared.v4.f32 {%f601, %f602, %f603, %f604}, [%rd424]; | |
st.global.v4.f32 [%rd397+147456], {%f601, %f602, %f603, %f604}; | |
shl.b64 %rd425, %rd55, 10; | |
add.s64 %rd426, %rd131, %rd425; | |
add.s64 %rd427, %rd426, %rd141; | |
ld.shared.v4.f32 {%f605, %f606, %f607, %f608}, [%rd427]; | |
st.global.v4.f32 [%rd397+163840], {%f605, %f606, %f607, %f608}; | |
shl.b64 %rd428, %rd56, 10; | |
add.s64 %rd429, %rd131, %rd428; | |
add.s64 %rd430, %rd429, %rd141; | |
ld.shared.v4.f32 {%f609, %f610, %f611, %f612}, [%rd430]; | |
st.global.v4.f32 [%rd397+180224], {%f609, %f610, %f611, %f612}; | |
ld.shared.v4.f32 {%f613, %f614, %f615, %f616}, [%rd392+49152]; | |
st.global.v4.f32 [%rd397+196608], {%f613, %f614, %f615, %f616}; | |
ld.shared.v4.f32 {%f617, %f618, %f619, %f620}, [%rd392+53248]; | |
st.global.v4.f32 [%rd397+212992], {%f617, %f618, %f619, %f620}; | |
ld.shared.v4.f32 {%f621, %f622, %f623, %f624}, [%rd392+57344]; | |
st.global.v4.f32 [%rd397+229376], {%f621, %f622, %f623, %f624}; | |
ld.shared.v4.f32 {%f625, %f626, %f627, %f628}, [%rd392+61440]; | |
st.global.v4.f32 [%rd397+245760], {%f625, %f626, %f627, %f628}; | |
ld.shared.v4.f32 {%f629, %f630, %f631, %f632}, [%rd392+65536]; | |
st.global.v4.f32 [%rd397+262144], {%f629, %f630, %f631, %f632}; | |
ld.shared.v4.f32 {%f633, %f634, %f635, %f636}, [%rd392+69632]; | |
st.global.v4.f32 [%rd397+278528], {%f633, %f634, %f635, %f636}; | |
ld.shared.v4.f32 {%f637, %f638, %f639, %f640}, [%rd392+73728]; | |
st.global.v4.f32 [%rd397+294912], {%f637, %f638, %f639, %f640}; | |
ld.shared.v4.f32 {%f641, %f642, %f643, %f644}, [%rd392+77824]; | |
st.global.v4.f32 [%rd397+311296], {%f641, %f642, %f643, %f644}; | |
ld.shared.v4.f32 {%f645, %f646, %f647, %f648}, [%rd392+81920]; | |
st.global.v4.f32 [%rd397+327680], {%f645, %f646, %f647, %f648}; | |
ld.shared.v4.f32 {%f649, %f650, %f651, %f652}, [%rd392+86016]; | |
st.global.v4.f32 [%rd397+344064], {%f649, %f650, %f651, %f652}; | |
ld.shared.v4.f32 {%f653, %f654, %f655, %f656}, [%rd392+90112]; | |
st.global.v4.f32 [%rd397+360448], {%f653, %f654, %f655, %f656}; | |
ld.shared.v4.f32 {%f657, %f658, %f659, %f660}, [%rd392+94208]; | |
st.global.v4.f32 [%rd397+376832], {%f657, %f658, %f659, %f660}; | |
ld.shared.v4.f32 {%f661, %f662, %f663, %f664}, [%rd392+98304]; | |
st.global.v4.f32 [%rd397+393216], {%f661, %f662, %f663, %f664}; | |
ld.shared.v4.f32 {%f665, %f666, %f667, %f668}, [%rd392+102400]; | |
st.global.v4.f32 [%rd397+409600], {%f665, %f666, %f667, %f668}; | |
ld.shared.v4.f32 {%f669, %f670, %f671, %f672}, [%rd392+106496]; | |
st.global.v4.f32 [%rd397+425984], {%f669, %f670, %f671, %f672}; | |
ld.shared.v4.f32 {%f673, %f674, %f675, %f676}, [%rd392+110592]; | |
st.global.v4.f32 [%rd397+442368], {%f673, %f674, %f675, %f676}; | |
ld.shared.v4.f32 {%f677, %f678, %f679, %f680}, [%rd392+114688]; | |
st.global.v4.f32 [%rd397+458752], {%f677, %f678, %f679, %f680}; | |
ld.shared.v4.f32 {%f681, %f682, %f683, %f684}, [%rd392+118784]; | |
st.global.v4.f32 [%rd397+475136], {%f681, %f682, %f683, %f684}; | |
ld.shared.v4.f32 {%f685, %f686, %f687, %f688}, [%rd392+122880]; | |
st.global.v4.f32 [%rd397+491520], {%f685, %f686, %f687, %f688}; | |
ld.shared.v4.f32 {%f689, %f690, %f691, %f692}, [%rd392+126976]; | |
st.global.v4.f32 [%rd397+507904], {%f689, %f690, %f691, %f692}; | |
bar.sync 0; | |
ret; | |
} | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// HAL executable holding one dispatch: a (3456x2048) x (2048x1024) f32 matmul
// that produces a 3456x1024 f32 result.
// NOTE(review): the exported symbol below has the same name as the PTX .entry
// earlier in this file; presumably this is the input IR that PTX was compiled
// from -- confirm.
hal.executable public @matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0 { | |
// Single variant; its target attribute is <"cuda", "cuda-nvptx-fb"> with
// target_arch = "sm_80".
hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>) { | |
// Exported entry point, ordinal 0.
// Pipeline layout: push_constants = 0; descriptor set 0 has three
// storage_buffer bindings -- 0 and 1 are flagged ReadOnly, 2 has no flag.
hal.executable.export public @matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) { | |
// Workgroup-count region: takes the device and returns the x/y/z counts
// yielded by flow.dispatch.workgroup_count_from_slice.
// NOTE(review): the counts are not concrete numbers at this stage;
// presumably a later compiler pass resolves them -- confirm.
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. The function takes no arguments and returns nothing; all
// data is reached through the interface bindings declared below.
func.func @matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32() { | |
// %cst: the f32 zero used to fill the result tensor before the matmul.
%cst = arith.constant 0.000000e+00 : f32 | |
// %c0: offset 0, shared by all three binding subspans.
%c0 = arith.constant 0 : index | |
// %0: set 0 / binding 0 -- read-only 3456x2048 f32 tensor (first matmul input).
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<3456x2048xf32>> | |
// %1: set 0 / binding 1 -- read-only 2048x1024 f32 tensor (second matmul input).
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> | |
// %2: set 0 / binding 2 -- write-only 3456x1024 f32 tensor (matmul result).
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<3456x1024xf32>> | |
// Load both inputs in full: offsets [0, 0], sizes equal to the whole tensor,
// unit strides.
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [3456, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3456x2048xf32>> -> tensor<3456x2048xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32> | |
// Create the 3456x1024 result tensor and fill it with %cst (0.0).
%5 = tensor.empty() : tensor<3456x1024xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<3456x1024xf32>) -> tensor<3456x1024xf32> | |
// Matmul of %3 by %4, with the zero-filled %6 as the outs operand.
%7 = linalg.matmul ins(%3, %4 : tensor<3456x2048xf32>, tensor<2048x1024xf32>) outs(%6 : tensor<3456x1024xf32>) -> tensor<3456x1024xf32> | |
// Store the whole 3456x1024 result into binding 2.
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [3456, 1024], strides = [1, 1] : tensor<3456x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<3456x1024xf32>> | |
return | |
} | |
} | |
} | |
} |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.