Skip to content

Instantly share code, notes, and snippets.

@kaushikcfd
Last active December 10, 2017 06:22
Show Gist options
  • Save kaushikcfd/f80c32533e2d4e10de3af52ba100b6f0 to your computer and use it in GitHub Desktop.
Save kaushikcfd/f80c32533e2d4e10de3af52ba100b6f0 to your computer and use it in GitHub Desktop.

This is the disassembly of the ELF corresponding to the mass matrix kernel. The corresponding program can be found here. Comparing the program with the kernel, we can easily identify the three compare-and-exchange (CAS) updates near the end of the listing. From this we see that the `iel` loop starts at the label `.L_10`, which means that all the floating-point operations are in the block that begins at `.L_10`.

//--------------------- .text.loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0 --------------------------
// Entry function (marked STO_CUDA_ENTRY below) as printed by the disassembler.
// How to read this listing:
//   * R0 is the loop counter: it is cleared before .L_10, incremented once per pass
//     and compared against 0x800; P0 (R0 < 0x800) drives the back-branch at .L_8.
//   * Each pass loads three 32-bit indices from each of two index arrays, gathers
//     double-precision values through them, computes three double-precision results
//     (R10:R11, R18:R19, R14:R15) and adds each one to memory with a
//     load / add / compare-and-swap sequence that retries until the swap succeeds.
//   * c[0x0][...] operands are presumably the kernel arguments (base pointers) and
//     c[0x3][...] a table of double-precision constants.
//     NOTE(review): confirm both against the host-side kernel signature.
	.section	.text.loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0,"ax",@progbits
	.sectioninfo	@"SHI_REGISTERS=32"
	.align	32
.text.loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0:
        .type           loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0,@function
        .size           loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0,(.L_76 - loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0)
        .other          loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0,@"STO_CUDA_ENTRY STV_DEFAULT"
loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0:
        // Presumably the usual stack-pointer setup; R1 is not read anywhere else in this listing.
        /*0008*/                   MOV R1, c[0x0][0x20];
        // R0 = 0: loop counter.
        /*0010*/                   MOV R0, RZ;
.L_10:
        // ---- Loop head (one pass per value of R0) ----
        // R2 = 3 * R0 (built from two 16-bit multiply steps). The SSY names .L_4 as the
        // point the SYNC instructions further down transfer to (see their TARGET notes).
        /*0018*/         {         XMAD R2, R0.reuse, 0x3, RZ;
        /*0028*/                   SSY `(.L_4);        }
        /*0030*/                   XMAD.PSL R2, R0.H1, 0x3, R2;
        // R3 / R2 = low / high word of (3 * R0) << 2: byte offset of three 4-byte entries.
        /*0038*/                   SHL R3, R2.reuse, 0x2;
        /*0048*/                   SHR R2, R2, 0x1e;
        // First index triple: R9, R10, R11 = 32-bit loads at base c[0x0][0x168] + offset + {0, 4, 8}.
        /*0050*/                   IADD R4.CC, R3, c[0x0][0x168];
        /*0058*/                   IADD.X R5, R2, c[0x0][0x16c];
        /*0068*/                   LDG.E.CI R9, [R4];
        /*0070*/                   LDG.E.CI R10, [R4+0x4];
        /*0078*/                   LDG.E.CI R11, [R4+0x8];
        // Second index triple from base c[0x0][0x150] + offset: R7 = [+0], R8 = [+4], R6 = [+8].
        // The pointer stays in R2:R3 and is reused by the three update sequences below.
        /*0088*/                   IADD R6.CC, R3, c[0x0][0x150];
        /*0090*/                   IADD.X R3, R2, c[0x0][0x154];
        /*0098*/                   MOV R2, R6;
        /*00a8*/                   LDG.E.CI R7, [R2];
        /*00b0*/                   LDG.E.CI R6, [R2+0x8];
        /*00b8*/                   LDG.E.CI R8, [R2+0x4];
        // Gather through the first triple. Each index is doubled, then scaled by 8 and added to
        // base c[0x0][0x160], i.e. address = base + 16 * index; two doubles are read per index
        // (at +0 and +8). Presumably two coordinates per vertex -- TODO confirm.
        // Point 0 (index R9): R16:R17 = [+0], R14:R15 = [+8].
        /*00c8*/                   SHL R9, R9, 0x1;
        /*00d0*/                   ISCADD R30.CC, R9.reuse, c[0x0][0x160], 0x3;
        /*00d8*/                   SHR R9, R9, 0x1d;
        /*00e8*/                   SHL R10, R10, 0x1;
        /*00f0*/                   IADD.X R31, R9, c[0x0][0x164];
        // Point 1 (index R10): address goes to R28:R29.
        /*00f8*/         {         ISCADD R28.CC, R10.reuse, c[0x0][0x160], 0x3;
        /*0108*/                   LDG.E.CI.64 R16, [R30];        }
        /*0110*/         {         SHR R10, R10, 0x1d;
        /*0118*/                   LDG.E.CI.64 R14, [R30+0x8];        }
        // Compiler-inserted scoreboard wait. Presumably it stalls until at most the given number
        // of outstanding loads remain in flight (TODO confirm); the same applies to the later ones.
        /*0128*/                   DEPBAR.LE SB5, 0x3;
        /*0130*/                   SHL R11, R11, 0x1;
        /*0138*/                   IADD.X R29, R10, c[0x0][0x164];
        // Point 2 (index R11): address goes to R12:R13. Point 1 data: R24:R25 = [+8], R18:R19 = [+0].
        /*0148*/         {         ISCADD R12.CC, R11.reuse, c[0x0][0x160], 0x3;
        /*0150*/                   LDG.E.CI.64 R24, [R28+0x8];        }
        /*0158*/         {         SHR R11, R11, 0x1d;
        /*0168*/                   LDG.E.CI.64 R18, [R28];        }
        /*0170*/                   DEPBAR.LE SB5, 0x3, {0};
        // R4 / R5 = low / high word of R7 * 8: byte offset of entry R7 in an array of doubles.
        // The same offset is used for the gather base c[0x0][0x148] and the output base c[0x0][0x170].
        /*0178*/                   SHL R4, R7.reuse, 0x3;
        /*0188*/                   SHR R5, R7, 0x1d;
        /*0190*/                   IADD.X R13, R11, c[0x0][0x164];
        // Point 2 data: R22:R23 = [+0], R20:R21 = [+8].
        /*0198*/                   LDG.E.CI.64 R22, [R12];
        // Gather through the second triple from base c[0x0][0x148] (one double per index).
        // Entry for index [+0]: address built in R10 (low) / R7 (high), then moved to R6:R7.
        /*01a8*/         {         IADD R10.CC, R4, c[0x0][0x148];
        /*01b0*/                   LDG.E.CI.64 R20, [R12+0x8];        }
        /*01b8*/         {         IADD.X R7, R5, c[0x0][0x14c];
        /*01c8*/                   DEPBAR.LE SB5, 0x3;        }
        // Entry for index [+8] (R6): address built in R9 (low) / R11 (high).
        /*01d0*/                   ISCADD R9.CC, R6.reuse, c[0x0][0x148], 0x3;
        /*01d8*/                   SHR R6, R6, 0x1d;
        /*01e8*/                   IADD.X R11, R6, c[0x0][0x14c];
        // u0 = R6:R7, the value gathered through index [+0].
        /*01f0*/                   MOV R6, R10;
        /*01f8*/                   LDG.E.CI.64 R6, [R6];
        /*0208*/                   DEPBAR.LE SB5, 0x3;
        // Entry for index [+4] (R8): address in R26:R27.
        /*0210*/                   ISCADD R26.CC, R8.reuse, c[0x0][0x148], 0x3;
        /*0218*/                   MOV R10, R9;
        // u2 = R10:R11, the value gathered through index [+8].
        /*0228*/         {         SHR R8, R8, 0x1d;
        /*0230*/                   LDG.E.CI.64 R10, [R10];        }
        /*0238*/                   IADD.X R27, R8, c[0x0][0x14c];
        // u1 = R12:R13, the value gathered through index [+4].
        /*0248*/                   LDG.E.CI.64 R12, [R26];
        // R8:R9 = output address for index [+0] (base c[0x0][0x170] + R4:R5);
        // R4:R5 = value currently stored there, used as the expected value of the first swap.
        /*0250*/                   IADD R8.CC, R4, c[0x0][0x170];
        /*0258*/                   IADD.X R9, R5, c[0x0][0x174];
        /*0268*/                   LDG.E.CV.64 R4, [R8];
        // With p0, p1, p2 the three points and (a, b) their two loaded components:
        //   R14:R15 = | (p1.a - p0.a) * (p2.b - p0.b) - (p2.a - p0.a) * (p1.b - p0.b) |
        // Presumably the absolute Jacobian determinant of the cell -- TODO confirm. Called "det" below.
        /*0270*/                   DADD R24, -R14, R24;
        /*0278*/                   DADD R22, -R16, R22;
        /*0288*/                   DEPBAR.LE SB5, 0x4, {3};
        /*0290*/                   DADD R16, R18, -R16;
        /*0298*/                   DADD R14, R20, -R14;
        /*02a8*/                   DMUL R22, R22, -R24;
        /*02b0*/                   DFMA R14, R16, R14, R22;
        /*02b8*/                   F2F.F64.F64 R14, |R14|;
        /*02c8*/                   DEPBAR.LE SB5, 0x2;
        // Three linear combinations of (u0, u1, u2) with the 3x3 table at c[0x3][0x0 .. 0x40]:
        //   q0 (R16:R17) = u0*c[0x00] + u1*c[0x08] + u2*c[0x10]
        //   q1 (R18:R19) = u0*c[0x18] + u1*c[0x20] + u2*c[0x28]
        //   q2 (R12:R13) = u0*c[0x30] + u1*c[0x38] + u2*c[0x40]
        /*02d0*/                   DMUL R16, R6, c[0x3][0x0];
        /*02d8*/                   DMUL R20, R6.reuse, c[0x3][0x18];
        /*02e8*/                   DMUL R24, R6, c[0x3][0x30];
        /*02f0*/                   DFMA R18, R10.reuse, c[0x3][0x10], R16;
        /*02f8*/         {         DFMA R6, R10.reuse, c[0x3][0x28], R20;
        /*0308*/                   DEPBAR.LE SB5, 0x1, {4};        }
        /*0310*/                   DFMA R16, R12, c[0x3][0x8], R18;
        /*0318*/                   DFMA R10, R10, c[0x3][0x40], R24;
        /*0328*/                   DFMA R18, R12, c[0x3][0x20], R6;
        // First result, with w0, w1, w2 = c[0x3][0x48], c[0x3][0x50], c[0x3][0x58]:
        //   R10:R11 = det * (q0*c[0x00]*w0 + q1*c[0x18]*w1 + q2*c[0x30]*w2)
        /*0330*/                   DMUL R6, R16, c[0x3][0x0];
        /*0338*/                   DFMA R12, R12, c[0x3][0x38], R10;
        /*0348*/                   DMUL R20, R6, c[0x3][0x48];
        /*0350*/                   DMUL R10, R18, c[0x3][0x18];
        /*0358*/                   DMUL R6, R12, c[0x3][0x30];
        /*0368*/                   DMUL R10, R10, c[0x3][0x50];
        /*0370*/                   DFMA R20, R20, R14.reuse, RZ;
        /*0378*/                   DMUL R6, R6, c[0x3][0x58];
        /*0388*/                   DFMA R20, R10, R14, R20;
        /*0390*/                   DFMA R10, R6, R14, R20;
        // First update attempt: new = result + old (R4:R5); the swap returns the value it found in R6:R7.
        /*0398*/                   DADD R6, R10, R4;
        /*03a8*/                   ATOM.E.CAS.64 R6, [R8], R4, R6;
        // Second and third results, computed while the swap is in flight:
        //   R18:R19 = det * (q0*c[0x08]*w0 + q1*c[0x20]*w1 + q2*c[0x38]*w2)
        //   R14:R15 = det * (q0*c[0x10]*w0 + q1*c[0x28]*w1 + q2*c[0x40]*w2)
        /*03b0*/                   DMUL R20, R16, c[0x3][0x8];
        /*03b8*/                   DMUL R22, R16, c[0x3][0x10];
        /*03c8*/                   DMUL R16, R18.reuse, c[0x3][0x20];
        /*03d0*/                   DMUL R18, R18, c[0x3][0x28];
        /*03d8*/                   DMUL R20, R20, c[0x3][0x48];
        /*03e8*/                   DMUL R24, R22, c[0x3][0x48];
        /*03f0*/                   DMUL R16, R16, c[0x3][0x50];
        /*03f8*/                   DMUL R18, R18, c[0x3][0x50];
        /*0408*/                   DFMA R20, R20, R14.reuse, RZ;
        /*0410*/                   DFMA R24, R24, R14, RZ;
        // R8:R9 (the first output address) is overwritten here, which is why the retry
        // path below has to rebuild the address from the index at [R2].
        /*0418*/                   DMUL R8, R12.reuse, c[0x3][0x38];
        /*0428*/                   DMUL R12, R12, c[0x3][0x40];
        // Loop counter increment, interleaved with the arithmetic.
        /*0430*/                   IADD32I R0, R0, 0x1;
        /*0438*/                   DFMA R16, R16, R14.reuse, R20;
        /*0448*/                   DFMA R24, R18, R14.reuse, R24;
        /*0450*/                   DMUL R8, R8, c[0x3][0x58];
        /*0458*/                   DMUL R12, R12, c[0x3][0x58];
        // P0 = (R0 < 0x800): loop-continue flag, consumed at .L_8.
        /*0468*/                   ISETP.LT.AND P0, PT, R0, 0x800, PT;
        /*0470*/                   DFMA R18, R8, R14, R16;
        /*0478*/                   DFMA R14, R12, R14, R24;
        // 64-bit compare of the value returned by the swap (R6:R7) with the expected one (R4:R5):
        // P1 set means the first swap succeeded, so skip the retry loop and continue at .L_4.
        /*0488*/                   IADD RZ.CC, -R4, R6;
        /*0490*/                   ISETP.EQ.X.AND P1, PT, R7, R5, PT;
        /*0498*/               @P1 SYNC                                         (*"TARGET= .L_4 "*);
        // Retry path for the first update: reload index [+0] and rebuild the output address in R8:R9.
        /*04a8*/                   LDG.E.CI R4, [R2];
        /*04b0*/                   ISCADD R8.CC, R4.reuse, c[0x0][0x170], 0x3;
        /*04b8*/                   SHR R4, R4, 0x1d;
        /*04c8*/                   IADD.X R9, R4, c[0x0][0x174];
.L_5:
        // Repeat load / add R10:R11 / compare-and-swap until the swap returns the expected value.
        /*04d0*/                   MOV R4, R8;
        /*04d8*/                   MOV R5, R9;
        /*04e8*/                   LDG.E.CV.64 R4, [R4];
        /*04f0*/                   DADD R6, R10, R4;
        /*04f8*/                   ATOM.E.CAS.64 R6, [R8], R4, R6;
        /*0508*/                   IADD RZ.CC, -R4, R6;
        /*0510*/                   ISETP.EQ.X.AND P1, PT, R7, R5, PT;
        /*0518*/              @!P1 BRA `(.L_5);
        /*0528*/                   SYNC                                         (*"TARGET= .L_4 "*);
.L_4:
        // Second update: add R18:R19 to the output entry selected by index [+4].
        /*0530*/                   LDG.E.CI R4, [R2+0x4];
        /*0538*/                   SSY `(.L_6);
        /*0548*/                   ISCADD R8.CC, R4.reuse, c[0x0][0x170], 0x3;
        /*0550*/                   SHR R4, R4, 0x1d;
        /*0558*/                   IADD.X R9, R4, c[0x0][0x174];
.L_7:
        // Same retry sequence; here P1 set means "values differ", so loop again.
        /*0568*/                   MOV R4, R8;
        /*0570*/                   MOV R5, R9;
        /*0578*/                   LDG.E.CV.64 R4, [R4];
        /*0588*/                   DADD R6, R18, R4;
        /*0590*/                   ATOM.E.CAS.64 R6, [R8], R4, R6;
        /*0598*/                   IADD RZ.CC, -R4, R6;
        /*05a8*/                   ISETP.NE.X.AND P1, PT, R7, R5, PT;
        /*05b0*/               @P1 BRA `(.L_7);
        /*05b8*/                   SYNC                                         (*"TARGET= .L_6 "*);
.L_6:
        // Third update: add R14:R15 to the output entry selected by index [+8].
        // R2 is overwritten by the index here; it is recomputed at the top of the next pass.
        /*05c8*/                   LDG.E.CI R2, [R2+0x8];
        /*05d0*/                   SSY `(.L_8);
        /*05d8*/                   ISCADD R8.CC, R2.reuse, c[0x0][0x170], 0x3;
        /*05e8*/                   SHR R4, R2, 0x1d;
        /*05f0*/                   IADD.X R9, R4, c[0x0][0x174];
.L_9:
        // Same retry sequence as .L_7, using R2:R3 as the load address.
        /*05f8*/                   MOV R2, R8;
        /*0608*/                   MOV R3, R9;
        /*0610*/                   LDG.E.CV.64 R4, [R2];
        /*0618*/                   DADD R6, R14, R4;
        /*0628*/                   ATOM.E.CAS.64 R6, [R8], R4, R6;
        /*0630*/                   IADD RZ.CC, -R4, R6;
        /*0638*/                   ISETP.NE.X.AND P1, PT, R7, R5, PT;
        /*0648*/               @P1 BRA `(.L_9);
        /*0650*/                   SYNC                                         (*"TARGET= .L_8 "*);
.L_8:
        // Next pass while R0 < 0x800 (P0 was set right after the increment); otherwise finish.
        /*0658*/               @P0 BRA `(.L_10);
        /*0668*/                   EXIT;
.L_11:
        // Self-branch placed after EXIT; presumably padding that is never reached.
        /*0670*/                   BRA `(.L_11);
.L_76:

Therefore, the part of the kernel shown below (the loop body starting at `.L_10`) is where the FLOPs need to be counted:

// Excerpt of the kernel: the instructions at offsets 0x0018 - 0x0490, i.e. one pass of the
// loop up to the comparison that follows the first compare-and-swap.
// R0 is the loop counter (incremented at 0x0430 and compared with 0x800 at 0x0468).
// c[0x0][...] operands are presumably kernel-argument base pointers -- TODO confirm.
// R3 / R2 = low / high word of (3 * R0) << 2: byte offset of three 4-byte entries.
/*0018*/         {         XMAD R2, R0.reuse, 0x3, RZ;
/*0028*/                   SSY `(.L_4);        }
/*0030*/                   XMAD.PSL R2, R0.H1, 0x3, R2;
/*0038*/                   SHL R3, R2.reuse, 0x2;
/*0048*/                   SHR R2, R2, 0x1e;
// First index triple (R9, R10, R11) from base c[0x0][0x168] + offset.
/*0050*/                   IADD R4.CC, R3, c[0x0][0x168];
/*0058*/                   IADD.X R5, R2, c[0x0][0x16c];
/*0068*/                   LDG.E.CI R9, [R4];
/*0070*/                   LDG.E.CI R10, [R4+0x4];
/*0078*/                   LDG.E.CI R11, [R4+0x8];
// Second index triple from base c[0x0][0x150] + offset: R7 = [+0], R8 = [+4], R6 = [+8].
/*0088*/                   IADD R6.CC, R3, c[0x0][0x150];
/*0090*/                   IADD.X R3, R2, c[0x0][0x154];
/*0098*/                   MOV R2, R6;
/*00a8*/                   LDG.E.CI R7, [R2];
/*00b0*/                   LDG.E.CI R6, [R2+0x8];
/*00b8*/                   LDG.E.CI R8, [R2+0x4];
// Gather through the first triple: address = c[0x0][0x160] + 16 * index, two doubles per
// index (at +0 and +8). Point 0 -> R16:R17 / R14:R15, point 1 -> R18:R19 / R24:R25,
// point 2 -> R22:R23 / R20:R21. The DEPBAR lines are compiler-inserted scoreboard waits.
/*00c8*/                   SHL R9, R9, 0x1;
/*00d0*/                   ISCADD R30.CC, R9.reuse, c[0x0][0x160], 0x3;
/*00d8*/                   SHR R9, R9, 0x1d;
/*00e8*/                   SHL R10, R10, 0x1;
/*00f0*/                   IADD.X R31, R9, c[0x0][0x164];
/*00f8*/         {         ISCADD R28.CC, R10.reuse, c[0x0][0x160], 0x3;
/*0108*/                   LDG.E.CI.64 R16, [R30];        }
/*0110*/         {         SHR R10, R10, 0x1d;
/*0118*/                   LDG.E.CI.64 R14, [R30+0x8];        }
/*0128*/                   DEPBAR.LE SB5, 0x3;
/*0130*/                   SHL R11, R11, 0x1;
/*0138*/                   IADD.X R29, R10, c[0x0][0x164];
/*0148*/         {         ISCADD R12.CC, R11.reuse, c[0x0][0x160], 0x3;
/*0150*/                   LDG.E.CI.64 R24, [R28+0x8];        }
/*0158*/         {         SHR R11, R11, 0x1d;
/*0168*/                   LDG.E.CI.64 R18, [R28];        }
/*0170*/                   DEPBAR.LE SB5, 0x3, {0};
// R4 / R5 = low / high word of R7 * 8: byte offset of entry R7 in an array of doubles.
/*0178*/                   SHL R4, R7.reuse, 0x3;
/*0188*/                   SHR R5, R7, 0x1d;
/*0190*/                   IADD.X R13, R11, c[0x0][0x164];
/*0198*/                   LDG.E.CI.64 R22, [R12];
// Gather through the second triple from base c[0x0][0x148], one double per index:
// u0 = R6:R7 (index [+0]), u2 = R10:R11 (index [+8]), u1 = R12:R13 (index [+4]).
/*01a8*/         {         IADD R10.CC, R4, c[0x0][0x148];
/*01b0*/                   LDG.E.CI.64 R20, [R12+0x8];        }
/*01b8*/         {         IADD.X R7, R5, c[0x0][0x14c];
/*01c8*/                   DEPBAR.LE SB5, 0x3;        }
/*01d0*/                   ISCADD R9.CC, R6.reuse, c[0x0][0x148], 0x3;
/*01d8*/                   SHR R6, R6, 0x1d;
/*01e8*/                   IADD.X R11, R6, c[0x0][0x14c];
/*01f0*/                   MOV R6, R10;
/*01f8*/                   LDG.E.CI.64 R6, [R6];
/*0208*/                   DEPBAR.LE SB5, 0x3;
/*0210*/                   ISCADD R26.CC, R8.reuse, c[0x0][0x148], 0x3;
/*0218*/                   MOV R10, R9;
/*0228*/         {         SHR R8, R8, 0x1d;
/*0230*/                   LDG.E.CI.64 R10, [R10];        }
/*0238*/                   IADD.X R27, R8, c[0x0][0x14c];
/*0248*/                   LDG.E.CI.64 R12, [R26];
// R8:R9 = output address (base c[0x0][0x170] + R4:R5); R4:R5 = value currently stored there.
/*0250*/                   IADD R8.CC, R4, c[0x0][0x170];
/*0258*/                   IADD.X R9, R5, c[0x0][0x174];
/*0268*/                   LDG.E.CV.64 R4, [R8];
// ---- Floating-point work starts here ----
// R14:R15 = | (p1.a - p0.a) * (p2.b - p0.b) - (p2.a - p0.a) * (p1.b - p0.b) |, called "det" below.
// Presumably the absolute Jacobian determinant of the cell -- TODO confirm.
/*0270*/                   DADD R24, -R14, R24;
/*0278*/                   DADD R22, -R16, R22;
/*0288*/                   DEPBAR.LE SB5, 0x4, {3};
/*0290*/                   DADD R16, R18, -R16;
/*0298*/                   DADD R14, R20, -R14;
/*02a8*/                   DMUL R22, R22, -R24;
/*02b0*/                   DFMA R14, R16, R14, R22;
/*02b8*/                   F2F.F64.F64 R14, |R14|;
/*02c8*/                   DEPBAR.LE SB5, 0x2;
// q0 (R16:R17), q1 (R18:R19), q2 (R12:R13): linear combinations of u0, u1, u2 with the
// 3x3 constant table at c[0x3][0x0 .. 0x40].
/*02d0*/                   DMUL R16, R6, c[0x3][0x0];
/*02d8*/                   DMUL R20, R6.reuse, c[0x3][0x18];
/*02e8*/                   DMUL R24, R6, c[0x3][0x30];
/*02f0*/                   DFMA R18, R10.reuse, c[0x3][0x10], R16;
/*02f8*/         {         DFMA R6, R10.reuse, c[0x3][0x28], R20;
/*0308*/                   DEPBAR.LE SB5, 0x1, {4};        }
/*0310*/                   DFMA R16, R12, c[0x3][0x8], R18;
/*0318*/                   DFMA R10, R10, c[0x3][0x40], R24;
/*0328*/                   DFMA R18, R12, c[0x3][0x20], R6;
// First result: R10:R11 = det * (q0*c[0x00]*c[0x48] + q1*c[0x18]*c[0x50] + q2*c[0x30]*c[0x58]).
/*0330*/                   DMUL R6, R16, c[0x3][0x0];
/*0338*/                   DFMA R12, R12, c[0x3][0x38], R10;
/*0348*/                   DMUL R20, R6, c[0x3][0x48];
/*0350*/                   DMUL R10, R18, c[0x3][0x18];
/*0358*/                   DMUL R6, R12, c[0x3][0x30];
/*0368*/                   DMUL R10, R10, c[0x3][0x50];
/*0370*/                   DFMA R20, R20, R14.reuse, RZ;
/*0378*/                   DMUL R6, R6, c[0x3][0x58];
/*0388*/                   DFMA R20, R10, R14, R20;
/*0390*/                   DFMA R10, R6, R14, R20;
// First update attempt: new = result + old (R4:R5), then compare-and-swap at [R8].
/*0398*/                   DADD R6, R10, R4;
/*03a8*/                   ATOM.E.CAS.64 R6, [R8], R4, R6;
// Second and third results:
//   R18:R19 = det * (q0*c[0x08]*c[0x48] + q1*c[0x20]*c[0x50] + q2*c[0x38]*c[0x58])
//   R14:R15 = det * (q0*c[0x10]*c[0x48] + q1*c[0x28]*c[0x50] + q2*c[0x40]*c[0x58])
/*03b0*/                   DMUL R20, R16, c[0x3][0x8];
/*03b8*/                   DMUL R22, R16, c[0x3][0x10];
/*03c8*/                   DMUL R16, R18.reuse, c[0x3][0x20];
/*03d0*/                   DMUL R18, R18, c[0x3][0x28];
/*03d8*/                   DMUL R20, R20, c[0x3][0x48];
/*03e8*/                   DMUL R24, R22, c[0x3][0x48];
/*03f0*/                   DMUL R16, R16, c[0x3][0x50];
/*03f8*/                   DMUL R18, R18, c[0x3][0x50];
/*0408*/                   DFMA R20, R20, R14.reuse, RZ;
/*0410*/                   DFMA R24, R24, R14, RZ;
/*0418*/                   DMUL R8, R12.reuse, c[0x3][0x38];
/*0428*/                   DMUL R12, R12, c[0x3][0x40];
// Loop counter increment, interleaved with the arithmetic.
/*0430*/                   IADD32I R0, R0, 0x1;
/*0438*/                   DFMA R16, R16, R14.reuse, R20;
/*0448*/                   DFMA R24, R18, R14.reuse, R24;
/*0450*/                   DMUL R8, R8, c[0x3][0x58];
/*0458*/                   DMUL R12, R12, c[0x3][0x58];
// P0 = (R0 < 0x800): loop-continue flag.
/*0468*/                   ISETP.LT.AND P0, PT, R0, 0x800, PT;
/*0470*/                   DFMA R18, R8, R14, R16;
/*0478*/                   DFMA R14, R12, R14, R24;
// 64-bit compare of the swap's returned value (R6:R7) with the expected one (R4:R5).
/*0488*/                   IADD RZ.CC, -R4, R6;
/*0490*/                   ISETP.EQ.X.AND P1, PT, R7, R5, PT;

A quick search for DMUL, DADD and DFMA gives the following instruction counts:

  1. DFMA: 16
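  2. DADD: 8 (5 in the excerpt above, plus 3 in the compare-and-swap update loops that follow it in the full listing)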
  3. DMUL: 22

Predicted by loopy: 84*nelements.

Total: 46*nelements FLOPs (16 + 8 + 22, counting each DFMA as a single operation).

This seems reasonable looking at the number of DFMA operations in the above ELF.

Doing the same thing for the laplace matrix:

The ptxas output:

ptxas info    : 0 bytes gmem, 52 bytes cmem[3]
ptxas info    : Compiling entry function 'loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0' for 'sm_52'
ptxas info    : Function properties for loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 52 registers, 380 bytes cmem[0], 36 bytes cmem[2]
ptxas info    : Compiling entry function 'loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel' for 'sm_52'
ptxas info    : Function properties for loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 8 registers, 380 bytes cmem[0]

Predicted by loopy: 504*nelements.

Calculated by hand, using the output from nvdisasm

  1. Number of divisions: 8, each expanding to 11 DMUL + 11 DFMA, i.e. 8*(11 DMUL + 11 DFMA)
  2. Number DFMAs: 47
  3. Number of DADDs: 6
  4. Number of DMULs: 17

Total FLOPs: 738*nelements.

Mixed problem:

ptxas info    : 0 bytes gmem, 148 bytes cmem[3]
ptxas info    : Compiling entry function 'loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0' for 'sm_52'
ptxas info    : Function properties for loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0
    24 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 55 registers, 412 bytes cmem[0], 36 bytes cmem[2]
ptxas info    : Compiling entry function 'loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel' for 'sm_52'
ptxas info    : Function properties for loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 11 registers, 412 bytes cmem[0]

It is becoming difficult to count the FLOPs by hand from the compiled kernel. The number reported by Loopy is 462*nelements.

Hyperelasticity:

ptxas info    : 0 bytes gmem, 52 bytes cmem[3]
ptxas info    : Compiling entry function 'loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0' for 'sm_52'
ptxas info    : Function properties for loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 72 registers, 388 bytes cmem[0], 56 bytes cmem[2]
ptxas info    : Compiling entry function 'loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel' for 'sm_52'
ptxas info    : Function properties for loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 8 registers, 388 bytes cmem[0]

Reported by Loopy: 48936*nelements

Calculations by hand from the output of nvdisasm:

  1. DFMA: 151
  2. DADD: 16
  3. DMUL: 52
  4. DDIV: 20 (equivalent to 440 FLOPs)

Therefore, the total FLOPs by hand calculation = 1977*nelements.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment