This is the ELF corresponding to the mass matrix. The corresponding program can be found here.
By comparing the program with the kernel we can easily identify the three cmpexchg (compare-and-exchange, `ATOM.E.CAS.64`) sequences at the end of the file. From this we see that the iel loop starts at label .L_10, which means that all the floating-point operations are in the block that begins at .L_10.
`
//--------------------- .text.loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0 --------------------------
// Reading guide (derived only from the instructions in this listing).
//   Notation: "Rn/Rn+1" is a 64-bit value in a register pair, low word in Rn
//   (see the .CC / .X carry chains below). c3[x] is c[0x3][x].
//   - R0 counts loop iterations: it starts at 0, is incremented at 0x0430 and
//     the body repeats while R0 < 0x800 (test at 0x0468, branch at .L_8).
//   - Each iteration reads two triples of 32-bit index words, gathers 64-bit
//     values through them, forms three results (R10/R11, R18/R19, R14/R15)
//     and adds each result to memory with a load / DADD / ATOM.E.CAS.64
//     sequence that is retried until the compare-and-swap matches.
//   - Double-precision arithmetic instructions in the whole listing:
//     16 DFMA, 22 DMUL, 8 DADD (5 in the straight-line part, plus one in each
//     of the retry loops .L_5, .L_7, .L_9) and one F2F used as absolute value.
//   - NOTE(review): DEPBAR.LE lines appear to be scoreboard waits inserted by
//     the scheduler; they perform no arithmetic -- confirm in the ISA docs.
.section .text.loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0,"ax",@progbits
.sectioninfo @"SHI_REGISTERS=32"
.align 32
.text.loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0:
.type loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0,@function
.size loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0,(.L_76 - loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0)
.other loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0,@"STO_CUDA_ENTRY STV_DEFAULT"
loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0:
// R1 is loaded from constant bank 0 and is not referenced again in this function.
// NOTE(review): looks like the conventional stack-pointer setup -- confirm.
/*0008*/ MOV R1, c[0x0][0x20];
// R0 = 0: loop counter.
/*0010*/ MOV R0, RZ;
.L_10:
// ---- Loop body (one pass per value of R0) ----
// R2 = R0 * 3, assembled from two 16-bit XMAD partial products. R3 and R2 then
// become the low and high words of (R0 * 3) << 2: the byte offset of three
// consecutive 32-bit entries. SSY names .L_4, the target of the SYNCs at
// 0x0498 and 0x0528.
/*0018*/ { XMAD R2, R0.reuse, 0x3, RZ;
/*0028*/ SSY `(.L_4); }
/*0030*/ XMAD.PSL R2, R0.H1, 0x3, R2;
/*0038*/ SHL R3, R2.reuse, 0x2;
/*0048*/ SHR R2, R2, 0x1e;
// R4/R5 = c[0x0][0x168..0x16c] + offset; load three 32-bit words -> R9, R10, R11.
/*0050*/ IADD R4.CC, R3, c[0x0][0x168];
/*0058*/ IADD.X R5, R2, c[0x0][0x16c];
/*0068*/ LDG.E.CI R9, [R4];
/*0070*/ LDG.E.CI R10, [R4+0x4];
/*0078*/ LDG.E.CI R11, [R4+0x8];
// R2/R3 = c[0x0][0x150..0x154] + offset (R6 is a temporary for the low word);
// load three 32-bit words: [R2] -> R7, [R2+0x8] -> R6, [R2+0x4] -> R8.
// R2/R3 stays live: the same three words are re-read at 0x04a8, 0x0530, 0x05c8.
/*0088*/ IADD R6.CC, R3, c[0x0][0x150];
/*0090*/ IADD.X R3, R2, c[0x0][0x154];
/*0098*/ MOV R2, R6;
/*00a8*/ LDG.E.CI R7, [R2];
/*00b0*/ LDG.E.CI R6, [R2+0x8];
/*00b8*/ LDG.E.CI R8, [R2+0x4];
// For each of R9, R10, R11: double it (SHL 0x1), scale by 8 (ISCADD shift 0x3)
// and add the base c[0x0][0x160..0x164], then load two consecutive 64-bit values:
//   via R9  (address R30/R31): [R30] -> R16/R17, [R30+0x8] -> R14/R15
//   via R10 (address R28/R29): [R28] -> R18/R19, [R28+0x8] -> R24/R25
//   via R11 (address R12/R13): [R12] -> R22/R23, [R12+0x8] -> R20/R21
// NOTE(review): presumably two coordinates per vertex -- confirm against the program.
/*00c8*/ SHL R9, R9, 0x1;
/*00d0*/ ISCADD R30.CC, R9.reuse, c[0x0][0x160], 0x3;
/*00d8*/ SHR R9, R9, 0x1d;
/*00e8*/ SHL R10, R10, 0x1;
/*00f0*/ IADD.X R31, R9, c[0x0][0x164];
/*00f8*/ { ISCADD R28.CC, R10.reuse, c[0x0][0x160], 0x3;
/*0108*/ LDG.E.CI.64 R16, [R30]; }
/*0110*/ { SHR R10, R10, 0x1d;
/*0118*/ LDG.E.CI.64 R14, [R30+0x8]; }
/*0128*/ DEPBAR.LE SB5, 0x3;
/*0130*/ SHL R11, R11, 0x1;
/*0138*/ IADD.X R29, R10, c[0x0][0x164];
/*0148*/ { ISCADD R12.CC, R11.reuse, c[0x0][0x160], 0x3;
/*0150*/ LDG.E.CI.64 R24, [R28+0x8]; }
/*0158*/ { SHR R11, R11, 0x1d;
/*0168*/ LDG.E.CI.64 R18, [R28]; }
/*0170*/ DEPBAR.LE SB5, 0x3, {0};
// R4/R5 = R7 << 3 as a 64-bit byte offset; used at 0x01a8 and again at 0x0250.
/*0178*/ SHL R4, R7.reuse, 0x3;
/*0188*/ SHR R5, R7, 0x1d;
// High word of the R11-derived address, then its two loads (R22/R23, R20/R21).
/*0190*/ IADD.X R13, R11, c[0x0][0x164];
/*0198*/ LDG.E.CI.64 R22, [R12];
// Gather three 64-bit values from base c[0x0][0x148..0x14c], indexed (times 8)
// by the words read from [R2], [R2+0x8], [R2+0x4]:
//   index R7 -> R6/R7,  index R6 -> R10/R11,  index R8 -> R12/R13
/*01a8*/ { IADD R10.CC, R4, c[0x0][0x148];
/*01b0*/ LDG.E.CI.64 R20, [R12+0x8]; }
/*01b8*/ { IADD.X R7, R5, c[0x0][0x14c];
/*01c8*/ DEPBAR.LE SB5, 0x3; }
/*01d0*/ ISCADD R9.CC, R6.reuse, c[0x0][0x148], 0x3;
/*01d8*/ SHR R6, R6, 0x1d;
/*01e8*/ IADD.X R11, R6, c[0x0][0x14c];
/*01f0*/ MOV R6, R10;
/*01f8*/ LDG.E.CI.64 R6, [R6];
/*0208*/ DEPBAR.LE SB5, 0x3;
/*0210*/ ISCADD R26.CC, R8.reuse, c[0x0][0x148], 0x3;
/*0218*/ MOV R10, R9;
/*0228*/ { SHR R8, R8, 0x1d;
/*0230*/ LDG.E.CI.64 R10, [R10]; }
/*0238*/ IADD.X R27, R8, c[0x0][0x14c];
/*0248*/ LDG.E.CI.64 R12, [R26];
// R8/R9 = c[0x0][0x170..0x174] + R4/R5: destination of the first update.
// Its current 64-bit value is read into R4/R5 (replacing the offset).
/*0250*/ IADD R8.CC, R4, c[0x0][0x170];
/*0258*/ IADD.X R9, R5, c[0x0][0x174];
/*0268*/ LDG.E.CV.64 R4, [R8];
// With (a0,b0) = ([R30],[R30+0x8]), (a1,b1) = ([R28],[R28+0x8]),
// (a2,b2) = ([R12],[R12+0x8]) as loaded above:
//   R14/R15 = | (a1 - a0)*(b2 - b0) - (a2 - a0)*(b1 - b0) |
// F2F.F64.F64 with |R14| is the absolute value.
// NOTE(review): presumably |det J| of the element -- confirm against the program.
/*0270*/ DADD R24, -R14, R24;
/*0278*/ DADD R22, -R16, R22;
/*0288*/ DEPBAR.LE SB5, 0x4, {3};
/*0290*/ DADD R16, R18, -R16;
/*0298*/ DADD R14, R20, -R14;
/*02a8*/ DMUL R22, R22, -R24;
/*02b0*/ DFMA R14, R16, R14, R22;
/*02b8*/ F2F.F64.F64 R14, |R14|;
/*02c8*/ DEPBAR.LE SB5, 0x2;
// Three weighted sums of the gathered values v0 = R6/R7, v1 = R12/R13,
// v2 = R10/R11 with coefficients from constant bank 3:
//   s0 = R16/R17 = v0*c3[0x00] + v1*c3[0x08] + v2*c3[0x10]
//   s1 = R18/R19 = v0*c3[0x18] + v1*c3[0x20] + v2*c3[0x28]
//   s2 = R12/R13 = v0*c3[0x30] + v1*c3[0x38] + v2*c3[0x40]   (finished at 0x0338)
/*02d0*/ DMUL R16, R6, c[0x3][0x0];
/*02d8*/ DMUL R20, R6.reuse, c[0x3][0x18];
/*02e8*/ DMUL R24, R6, c[0x3][0x30];
/*02f0*/ DFMA R18, R10.reuse, c[0x3][0x10], R16;
/*02f8*/ { DFMA R6, R10.reuse, c[0x3][0x28], R20;
/*0308*/ DEPBAR.LE SB5, 0x1, {4}; }
/*0310*/ DFMA R16, R12, c[0x3][0x8], R18;
/*0318*/ DFMA R10, R10, c[0x3][0x40], R24;
/*0328*/ DFMA R18, R12, c[0x3][0x20], R6;
// First result, accumulated term by term with DFMA (d = R14/R15):
//   R10/R11 = s0*c3[0x00]*c3[0x48]*d + s1*c3[0x18]*c3[0x50]*d + s2*c3[0x30]*c3[0x58]*d
/*0330*/ DMUL R6, R16, c[0x3][0x0];
/*0338*/ DFMA R12, R12, c[0x3][0x38], R10;
/*0348*/ DMUL R20, R6, c[0x3][0x48];
/*0350*/ DMUL R10, R18, c[0x3][0x18];
/*0358*/ DMUL R6, R12, c[0x3][0x30];
/*0368*/ DMUL R10, R10, c[0x3][0x50];
/*0370*/ DFMA R20, R20, R14.reuse, RZ;
/*0378*/ DMUL R6, R6, c[0x3][0x58];
/*0388*/ DFMA R20, R10, R14, R20;
/*0390*/ DFMA R10, R6, R14, R20;
// R6/R7 = first result + value read at 0x0268, then a 64-bit compare-and-swap
// on [R8] with compare value R4/R5 and new value R6/R7. The value written to
// R6/R7 by the atomic is compared with R4/R5 at 0x0488-0x0490.
/*0398*/ DADD R6, R10, R4;
/*03a8*/ ATOM.E.CAS.64 R6, [R8], R4, R6;
// Second and third results (d = R14/R15, same accumulation scheme):
//   R18/R19 = s0*c3[0x08]*c3[0x48]*d + s1*c3[0x20]*c3[0x50]*d + s2*c3[0x38]*c3[0x58]*d
//   R14/R15 = s0*c3[0x10]*c3[0x48]*d + s1*c3[0x28]*c3[0x50]*d + s2*c3[0x40]*c3[0x58]*d
// R8/R9 (the destination address) is reused as scratch from 0x0418 on; the
// retry path at 0x04a8 rebuilds it. R14/R15 (d) is overwritten last, at 0x0478.
/*03b0*/ DMUL R20, R16, c[0x3][0x8];
/*03b8*/ DMUL R22, R16, c[0x3][0x10];
/*03c8*/ DMUL R16, R18.reuse, c[0x3][0x20];
/*03d0*/ DMUL R18, R18, c[0x3][0x28];
/*03d8*/ DMUL R20, R20, c[0x3][0x48];
/*03e8*/ DMUL R24, R22, c[0x3][0x48];
/*03f0*/ DMUL R16, R16, c[0x3][0x50];
/*03f8*/ DMUL R18, R18, c[0x3][0x50];
/*0408*/ DFMA R20, R20, R14.reuse, RZ;
/*0410*/ DFMA R24, R24, R14, RZ;
/*0418*/ DMUL R8, R12.reuse, c[0x3][0x38];
/*0428*/ DMUL R12, R12, c[0x3][0x40];
// Loop counter update, interleaved with the arithmetic.
/*0430*/ IADD32I R0, R0, 0x1;
/*0438*/ DFMA R16, R16, R14.reuse, R20;
/*0448*/ DFMA R24, R18, R14.reuse, R24;
/*0450*/ DMUL R8, R8, c[0x3][0x58];
/*0458*/ DMUL R12, R12, c[0x3][0x58];
// P0 = (R0 < 0x800); consumed by the branch at .L_8.
/*0468*/ ISETP.LT.AND P0, PT, R0, 0x800, PT;
/*0470*/ DFMA R18, R8, R14, R16;
/*0478*/ DFMA R14, R12, R14, R24;
// P1 = (R6/R7 == R4/R5), a 64-bit equality built from the carry-chained
// IADD / ISETP.EQ.X pair. When P1 holds, SYNC goes straight to .L_4.
/*0488*/ IADD RZ.CC, -R4, R6;
/*0490*/ ISETP.EQ.X.AND P1, PT, R7, R5, PT;
/*0498*/ @P1 SYNC (*"TARGET= .L_4 "*);
// Retry path of the first update: re-read the first index word from [R2] and
// rebuild R8/R9 = c[0x0][0x170..0x174] + index * 8.
/*04a8*/ LDG.E.CI R4, [R2];
/*04b0*/ ISCADD R8.CC, R4.reuse, c[0x0][0x170], 0x3;
/*04b8*/ SHR R4, R4, 0x1d;
/*04c8*/ IADD.X R9, R4, c[0x0][0x174];
.L_5:
// Retry loop: reload the destination value, add R10/R11, compare-and-swap,
// and branch back while the value in R6/R7 differs from the value that was read.
/*04d0*/ MOV R4, R8;
/*04d8*/ MOV R5, R9;
/*04e8*/ LDG.E.CV.64 R4, [R4];
/*04f0*/ DADD R6, R10, R4;
/*04f8*/ ATOM.E.CAS.64 R6, [R8], R4, R6;
/*0508*/ IADD RZ.CC, -R4, R6;
/*0510*/ ISETP.EQ.X.AND P1, PT, R7, R5, PT;
/*0518*/ @!P1 BRA `(.L_5);
/*0528*/ SYNC (*"TARGET= .L_4 "*);
.L_4:
// Second update: index word [R2+0x4], destination R8/R9 =
// c[0x0][0x170..0x174] + index * 8, addend R18/R19. SSY names .L_6.
/*0530*/ LDG.E.CI R4, [R2+0x4];
/*0538*/ SSY `(.L_6);
/*0548*/ ISCADD R8.CC, R4.reuse, c[0x0][0x170], 0x3;
/*0550*/ SHR R4, R4, 0x1d;
/*0558*/ IADD.X R9, R4, c[0x0][0x174];
.L_7:
// Load / add / compare-and-swap loop; here P1 means "not equal", so @P1 retries.
/*0568*/ MOV R4, R8;
/*0570*/ MOV R5, R9;
/*0578*/ LDG.E.CV.64 R4, [R4];
/*0588*/ DADD R6, R18, R4;
/*0590*/ ATOM.E.CAS.64 R6, [R8], R4, R6;
/*0598*/ IADD RZ.CC, -R4, R6;
/*05a8*/ ISETP.NE.X.AND P1, PT, R7, R5, PT;
/*05b0*/ @P1 BRA `(.L_7);
/*05b8*/ SYNC (*"TARGET= .L_6 "*);
.L_6:
// Third update: index word [R2+0x8] (this load overwrites R2), destination
// R8/R9 = c[0x0][0x170..0x174] + index * 8, addend R14/R15. SSY names .L_8.
/*05c8*/ LDG.E.CI R2, [R2+0x8];
/*05d0*/ SSY `(.L_8);
/*05d8*/ ISCADD R8.CC, R2.reuse, c[0x0][0x170], 0x3;
/*05e8*/ SHR R4, R2, 0x1d;
/*05f0*/ IADD.X R9, R4, c[0x0][0x174];
.L_9:
// Same retry loop; R2/R3 is a copy of the address R8/R9 used for the reload.
/*05f8*/ MOV R2, R8;
/*0608*/ MOV R3, R9;
/*0610*/ LDG.E.CV.64 R4, [R2];
/*0618*/ DADD R6, R14, R4;
/*0628*/ ATOM.E.CAS.64 R6, [R8], R4, R6;
/*0630*/ IADD RZ.CC, -R4, R6;
/*0638*/ ISETP.NE.X.AND P1, PT, R7, R5, PT;
/*0648*/ @P1 BRA `(.L_9);
/*0650*/ SYNC (*"TARGET= .L_8 "*);
.L_8:
// Repeat the body while P0 (set at 0x0468) holds; otherwise exit.
/*0658*/ @P0 BRA `(.L_10);
/*0668*/ EXIT;
.L_11:
// Branch-to-self placed after EXIT.
/*0670*/ BRA `(.L_11);
// End-of-function label; used by the .size directive above.
.L_76:
Therefore, below is the part of the kernel (the body of the loop starting at .L_10) that contains all the floating-point operations:
/*0018*/ { XMAD R2, R0.reuse, 0x3, RZ;
/*0028*/ SSY `(.L_4); }
/*0030*/ XMAD.PSL R2, R0.H1, 0x3, R2;
/*0038*/ SHL R3, R2.reuse, 0x2;
/*0048*/ SHR R2, R2, 0x1e;
/*0050*/ IADD R4.CC, R3, c[0x0][0x168];
/*0058*/ IADD.X R5, R2, c[0x0][0x16c];
/*0068*/ LDG.E.CI R9, [R4];
/*0070*/ LDG.E.CI R10, [R4+0x4];
/*0078*/ LDG.E.CI R11, [R4+0x8];
/*0088*/ IADD R6.CC, R3, c[0x0][0x150];
/*0090*/ IADD.X R3, R2, c[0x0][0x154];
/*0098*/ MOV R2, R6;
/*00a8*/ LDG.E.CI R7, [R2];
/*00b0*/ LDG.E.CI R6, [R2+0x8];
/*00b8*/ LDG.E.CI R8, [R2+0x4];
/*00c8*/ SHL R9, R9, 0x1;
/*00d0*/ ISCADD R30.CC, R9.reuse, c[0x0][0x160], 0x3;
/*00d8*/ SHR R9, R9, 0x1d;
/*00e8*/ SHL R10, R10, 0x1;
/*00f0*/ IADD.X R31, R9, c[0x0][0x164];
/*00f8*/ { ISCADD R28.CC, R10.reuse, c[0x0][0x160], 0x3;
/*0108*/ LDG.E.CI.64 R16, [R30]; }
/*0110*/ { SHR R10, R10, 0x1d;
/*0118*/ LDG.E.CI.64 R14, [R30+0x8]; }
/*0128*/ DEPBAR.LE SB5, 0x3;
/*0130*/ SHL R11, R11, 0x1;
/*0138*/ IADD.X R29, R10, c[0x0][0x164];
/*0148*/ { ISCADD R12.CC, R11.reuse, c[0x0][0x160], 0x3;
/*0150*/ LDG.E.CI.64 R24, [R28+0x8]; }
/*0158*/ { SHR R11, R11, 0x1d;
/*0168*/ LDG.E.CI.64 R18, [R28]; }
/*0170*/ DEPBAR.LE SB5, 0x3, {0};
/*0178*/ SHL R4, R7.reuse, 0x3;
/*0188*/ SHR R5, R7, 0x1d;
/*0190*/ IADD.X R13, R11, c[0x0][0x164];
/*0198*/ LDG.E.CI.64 R22, [R12];
/*01a8*/ { IADD R10.CC, R4, c[0x0][0x148];
/*01b0*/ LDG.E.CI.64 R20, [R12+0x8]; }
/*01b8*/ { IADD.X R7, R5, c[0x0][0x14c];
/*01c8*/ DEPBAR.LE SB5, 0x3; }
/*01d0*/ ISCADD R9.CC, R6.reuse, c[0x0][0x148], 0x3;
/*01d8*/ SHR R6, R6, 0x1d;
/*01e8*/ IADD.X R11, R6, c[0x0][0x14c];
/*01f0*/ MOV R6, R10;
/*01f8*/ LDG.E.CI.64 R6, [R6];
/*0208*/ DEPBAR.LE SB5, 0x3;
/*0210*/ ISCADD R26.CC, R8.reuse, c[0x0][0x148], 0x3;
/*0218*/ MOV R10, R9;
/*0228*/ { SHR R8, R8, 0x1d;
/*0230*/ LDG.E.CI.64 R10, [R10]; }
/*0238*/ IADD.X R27, R8, c[0x0][0x14c];
/*0248*/ LDG.E.CI.64 R12, [R26];
/*0250*/ IADD R8.CC, R4, c[0x0][0x170];
/*0258*/ IADD.X R9, R5, c[0x0][0x174];
/*0268*/ LDG.E.CV.64 R4, [R8];
/*0270*/ DADD R24, -R14, R24;
/*0278*/ DADD R22, -R16, R22;
/*0288*/ DEPBAR.LE SB5, 0x4, {3};
/*0290*/ DADD R16, R18, -R16;
/*0298*/ DADD R14, R20, -R14;
/*02a8*/ DMUL R22, R22, -R24;
/*02b0*/ DFMA R14, R16, R14, R22;
/*02b8*/ F2F.F64.F64 R14, |R14|;
/*02c8*/ DEPBAR.LE SB5, 0x2;
/*02d0*/ DMUL R16, R6, c[0x3][0x0];
/*02d8*/ DMUL R20, R6.reuse, c[0x3][0x18];
/*02e8*/ DMUL R24, R6, c[0x3][0x30];
/*02f0*/ DFMA R18, R10.reuse, c[0x3][0x10], R16;
/*02f8*/ { DFMA R6, R10.reuse, c[0x3][0x28], R20;
/*0308*/ DEPBAR.LE SB5, 0x1, {4}; }
/*0310*/ DFMA R16, R12, c[0x3][0x8], R18;
/*0318*/ DFMA R10, R10, c[0x3][0x40], R24;
/*0328*/ DFMA R18, R12, c[0x3][0x20], R6;
/*0330*/ DMUL R6, R16, c[0x3][0x0];
/*0338*/ DFMA R12, R12, c[0x3][0x38], R10;
/*0348*/ DMUL R20, R6, c[0x3][0x48];
/*0350*/ DMUL R10, R18, c[0x3][0x18];
/*0358*/ DMUL R6, R12, c[0x3][0x30];
/*0368*/ DMUL R10, R10, c[0x3][0x50];
/*0370*/ DFMA R20, R20, R14.reuse, RZ;
/*0378*/ DMUL R6, R6, c[0x3][0x58];
/*0388*/ DFMA R20, R10, R14, R20;
/*0390*/ DFMA R10, R6, R14, R20;
/*0398*/ DADD R6, R10, R4;
/*03a8*/ ATOM.E.CAS.64 R6, [R8], R4, R6;
/*03b0*/ DMUL R20, R16, c[0x3][0x8];
/*03b8*/ DMUL R22, R16, c[0x3][0x10];
/*03c8*/ DMUL R16, R18.reuse, c[0x3][0x20];
/*03d0*/ DMUL R18, R18, c[0x3][0x28];
/*03d8*/ DMUL R20, R20, c[0x3][0x48];
/*03e8*/ DMUL R24, R22, c[0x3][0x48];
/*03f0*/ DMUL R16, R16, c[0x3][0x50];
/*03f8*/ DMUL R18, R18, c[0x3][0x50];
/*0408*/ DFMA R20, R20, R14.reuse, RZ;
/*0410*/ DFMA R24, R24, R14, RZ;
/*0418*/ DMUL R8, R12.reuse, c[0x3][0x38];
/*0428*/ DMUL R12, R12, c[0x3][0x40];
/*0430*/ IADD32I R0, R0, 0x1;
/*0438*/ DFMA R16, R16, R14.reuse, R20;
/*0448*/ DFMA R24, R18, R14.reuse, R24;
/*0450*/ DMUL R8, R8, c[0x3][0x58];
/*0458*/ DMUL R12, R12, c[0x3][0x58];
/*0468*/ ISETP.LT.AND P0, PT, R0, 0x800, PT;
/*0470*/ DFMA R18, R8, R14, R16;
/*0478*/ DFMA R14, R12, R14, R24;
/*0488*/ IADD RZ.CC, -R4, R6;
/*0490*/ ISETP.EQ.X.AND P1, PT, R7, R5, PT;
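Running a quick find for DMUL, DADD and DFMA in the full listing gives:
- DFMA: 16
- DADD: 8
- DMUL: 22

Predicted by loopy: 84*nelements.
Total of 46*nelements FLOPs (16 + 8 + 22, i.e. counting every instruction, including each DFMA, as one operation).
This seems reasonable looking at the number of DFMA operations in the above ELF: each DFMA fuses a multiply and an add, so counting a DFMA as two operations gives 62*nelements.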
The ptxas output:
ptxas info : 0 bytes gmem, 52 bytes cmem[3]
ptxas info : Compiling entry function 'loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0' for 'sm_52'
ptxas info : Function properties for loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 52 registers, 380 bytes cmem[0], 36 bytes cmem[2]
ptxas info : Compiling entry function 'loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel' for 'sm_52'
ptxas info : Function properties for loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 8 registers, 380 bytes cmem[0]
Predicted by loopy: 504*nelements.
Calculated by hand, using the output from nvdisasm
- Number of divs: 8 = 8*(11 DMUL + 11 DFMA)
- Number of DFMAs: 47
- Number of DADDs: 6
- Number of DMULs: 17

Total FLOPs: 738*nelements.
ptxas info : 0 bytes gmem, 148 bytes cmem[3]
ptxas info : Compiling entry function 'loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0' for 'sm_52'
ptxas info : Function properties for loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0
24 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 55 registers, 412 bytes cmem[0], 36 bytes cmem[2]
ptxas info : Compiling entry function 'loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel' for 'sm_52'
ptxas info : Function properties for loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 11 registers, 412 bytes cmem[0]
It is becoming difficult to calculate the value of FLOPs from the ptx file.
The number reported by Loopy is 462*nelements
ptxas info : 0 bytes gmem, 52 bytes cmem[3]
ptxas info : Compiling entry function 'loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0' for 'sm_52'
ptxas info : Function properties for loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 72 registers, 388 bytes cmem[0], 56 bytes cmem[2]
ptxas info : Compiling entry function 'loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel' for 'sm_52'
ptxas info : Function properties for loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 8 registers, 388 bytes cmem[0]
Reported by Loopy: 48936*nelements
Calculations by hand from the output of nvdisasm:
- DFMA: 151
- DADD: 16
- DMUL: 52
- DDIV: 20 (equivalent to 440 FLOPs)
Therefore Total FLOPs by hand calculation = 1977*nelements