Last active
January 3, 2016 01:29
-
-
Save culurciello/8389426 to your computer and use it in GitHub Desktop.
Test of computer speed using dummy math / linear-algebra code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| Test of C code speed | |
| compile with: gcc -Ofast -fopenmp -mavx mactest.c | |
| or gfortran -O3 -fopenmp mactest.c | |
| */ | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <fcntl.h> | |
| #include <sys/time.h> | |
| #include <math.h> | |
| #include <omp.h> | |
/*
 * Benchmark entry point.
 *
 * Fills two float buffers of nthreads rows by dsize elements, then times
 * nbOfAverages passes of an element-wise a = a * b over every row, with the
 * rows distributed by an OpenMP "parallel for". Prints the operation count,
 * the elapsed wall-clock time and the resulting throughput in G OP/s.
 *
 * Returns 0 on success, 1 if the buffers cannot be allocated.
 */
int main()
{
    // nb of operations:
    const int dsize = 1024*64;      // elements per row
    const int nthreads = 8;         // number of rows handed to the parallel loop
    const int vcrz = 8;             // vectorize by #
    const int nbOfAverages = 1024;  // timed passes over each row
    const int opsMAC = 2;           // operations per MAC

    // Heap storage: two nthreads x dsize float buffers are several MB, which
    // is too large to place safely on the stack.
    const size_t total = (size_t)nthreads * (size_t)dsize;
    float *a = malloc(total * sizeof *a);
    float *b = malloc(total * sizeof *b);
    if (a == NULL || b == NULL) {
        fprintf(stderr, "mactest: cannot allocate %zu floats per buffer\n", total);
        free(a);
        free(b);
        return 1;
    }

    // Give every element a defined value. Multiplying by 1.0f keeps the data
    // finite and normal across all passes, so the timing is not skewed by
    // inf/NaN/denormal operands.
    for (size_t n = 0; n < total; n++) {
        a[n] = 1.0f;
        b[n] = 1.0f;
    }

    // Elements per row actually covered by the vcrz-wide chunks below; the
    // operation count is derived from this so it matches the work performed.
    const int processed = dsize - dsize % vcrz;

    struct timeval start, end;
    gettimeofday(&start, NULL);

    // Loop variables are declared inside the construct, so each thread gets
    // its own copies without a private() clause.
    #pragma omp parallel for
    for (int k = 0; k < nthreads; k++) {
        float *ak = a + (size_t)k * (size_t)dsize;
        const float *bk = b + (size_t)k * (size_t)dsize;

        for (int i = 0; i < nbOfAverages; i++) {
            // Walk the whole row in chunks of vcrz elements.
            for (int j = 0; j + vcrz <= dsize; j += vcrz) {
                // NOTE(review): each element update is a single multiply,
                // while opsMAC counts two operations per element; the figure
                // reported below follows that convention. Confirm whether a
                // true multiply-accumulate was intended.
                for (int v = 0; v < vcrz; v++) {
                    ak[j + v] = ak[j + v] * bk[j + v];
                }
            }
        }
    }

    gettimeofday(&end, NULL);

    // Consume the results (outside the timed region) so the optimizer cannot
    // treat the benchmark loop as dead code.
    float sum = 0.0f;
    for (size_t n = 0; n < total; n++) {
        sum += a[n];
    }
    volatile float sink = sum;
    (void)sink;

    // Elapsed time in seconds.
    double t = (double)(end.tv_sec - start.tv_sec)
             + (double)(end.tv_usec - start.tv_usec) / 1e6;

    // report performance:
    // The product is formed in long long so it cannot overflow int.
    long long tops = (long long)nthreads * opsMAC * nbOfAverages * processed; // total ops

    printf("\nTotal ops = %lld, # of treads = %d", tops, nthreads);
    printf("\nTime in s: %lf:", t);
    printf("\nTest performance [G OP/s] %lf:", tops/t/1e9);
    printf("\n");

    free(a);
    free(b);
    return(0);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Your pragmas indicate you're using the Intel compiler. Just replace your k and j loops with a call to DAXPY and use the "-mkl=parallel" flag. That will maximize your SIMD and multithreaded performance.