Forked from genkuroki/Benchmark test of Monte Carlo calculation of pi (Julia v1.1.1).ipynb
Created
February 24, 2018 19:45
-
-
Save giruzou/59013959681fc5eaf087bf9192ab546e to your computer and use it in GitHub Desktop.
C/Julia vs. gcc/Benchmark test of Monte Carlo calculation of pi.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# Benchmark test of Monte Carlo calculation of pi\n\ngcc Allied Forces vs. Julia Imperial Military \n\nGen Kuroki\n\n2017-08-20" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "versioninfo()", | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "Julia Version 0.6.0\nCommit 903644385b* (2017-06-19 13:05 UTC)\nPlatform Info:\n OS: Windows (x86_64-w64-mingw32)\n CPU: Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz\n WORD_SIZE: 64\n BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell)\n LAPACK: libopenblas64_\n LIBM: libopenlibm\n LLVM: libLLVM-3.9.1 (ORCJIT, haswell)\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# julia -p auto\n\nnprocs()", | |
"execution_count": 2, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 2, | |
"data": { | |
"text/plain": "9" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "run(`gcc --version`)", | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "gcc (Rev5, Built by MSYS2 project) 5.3.0\r\nCopyright (C) 2015 Free Software Foundation, Inc.\r\nThis is free software; see the source for copying conditions. There is NO\r\nwarranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\r\n\r\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "ENV[\"OMP_NUM_THREADS\"]", | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 4, | |
"data": { | |
"text/plain": "\"8\"" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "using BenchmarkTools", | |
"execution_count": 5, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "function findpi_julia(n)\n s = 0\n for j in 1:n\n s += ifelse(rand()^2+rand()^2 ≤ 1, 1, 0)\n end\n return 4.0 * s / n\nend\nfindpi_julia(10^5)", | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 6, | |
"data": { | |
"text/plain": "3.14792" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "@time findpi_julia(10^8)", | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": " 0.412957 seconds (265 allocations: 18.252 KiB)\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"execution_count": 7, | |
"data": { | |
"text/plain": "3.14171532" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# julia -p auto\n\nfunction findpi_julia_parallel(n)\n s = @parallel (+) for j in 1:n\n ifelse(rand()^2+rand()^2 ≤ 1, 1, 0)\n end\n return 4.0 * s / n\nend\nfindpi_julia_parallel(10^5)", | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 8, | |
"data": { | |
"text/plain": "3.14716" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "@time findpi_julia_parallel(10^8)", | |
"execution_count": 9, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": " 0.152859 seconds (1.23 k allocations: 100.344 KiB)\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"execution_count": 9, | |
"data": { | |
"text/plain": "3.14155552" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "C_code= raw\"\"\"\n#include <time.h>\n#include <stdlib.h>\n\ndouble findpi(unsigned long n){\n double x,y;\n srand((unsigned)time(NULL));\n double R = (double) RAND_MAX;\n unsigned long count = 0;\n for(unsigned long j=0; j < n; j++){\n x = ((double) rand())/R;\n y = ((double) rand())/R;\n if(x*x + y*y <= 1){\n count++;\n }\n }\n return ((double) 4.0)*((double) count)/((double) n);\n}\n\"\"\"\n\nfilename = tempname()\nfilenamedl = filename * \".\" * Libdl.dlext\n\nopen(`gcc -Wall -O3 -march=native -xc -shared -o $filenamedl -`, \"w\") do f\n print(f, C_code)\nend\n\nrun(`ls -l $filenamedl`)", | |
"execution_count": 10, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "-rwxr-xr-x 1 genkuroki 197610 109681 Aug 21 00:50 C:\\Users\\GENKUR~1\\AppData\\Local\\Temp\\jl_A66.tmp.dll\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "const findpi_gcc_rand_lib = filename\nfindpi_gcc_rand(n::Int64) = ccall((:findpi, findpi_gcc_rand_lib), Float64, (Int64,), n)\nfindpi_gcc_rand(10^5)", | |
"execution_count": 11, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 11, | |
"data": { | |
"text/plain": "3.14116" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "@time findpi_gcc_rand(10^8)", | |
"execution_count": 12, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": " 2.572927 seconds (6 allocations: 192 bytes)\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"execution_count": 12, | |
"data": { | |
"text/plain": "3.14125296" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "C_code= raw\"\"\"\n#include <time.h>\n#include <stdlib.h>\n\ndouble findpi(unsigned long n){\n unsigned long x,y;\n srand((unsigned)time(NULL));\n double RR = (unsigned long) RAND_MAX*RAND_MAX;\n unsigned long count = 0;\n for(unsigned long j=0; j < n; j++){\n x = (unsigned long) rand();\n y = (unsigned long) rand();\n if(x*x + y*y <= RR){\n count++;\n }\n }\n return ((double) 4.0)*((double) count)/((double) n);\n}\n\"\"\"\n\nfilename = tempname()\nfilenamedl = filename * \".\" * Libdl.dlext\n\nopen(`gcc -Wall -O3 -march=native -xc -shared -o $filenamedl -`, \"w\") do f\n print(f, C_code)\nend\n\nrun(`ls -l $filenamedl`)", | |
"execution_count": 13, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "-rwxr-xr-x 1 genkuroki 197610 109681 Aug 21 00:50 C:\\Users\\GENKUR~1\\AppData\\Local\\Temp\\jl_18B0.tmp.dll\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "const findpi_gcc_rand_int_lib = filename\nfindpi_gcc_rand_int(n::Int64) = ccall((:findpi, findpi_gcc_rand_int_lib), Float64, (Int64,), n)\nfindpi_gcc_rand_int(10^5)", | |
"execution_count": 14, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 14, | |
"data": { | |
"text/plain": "3.13108" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "@time findpi_gcc_rand_int(10^8)", | |
"execution_count": 15, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": " 1.871264 seconds (6 allocations: 192 bytes)\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"execution_count": 15, | |
"data": { | |
"text/plain": "3.14121536" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Download http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/MT2002/CODES/mt19937ar.sep.tgz\n#\n# Extract\n# mt19937ar.sep/mt19937ar.h\n# mt19937ar.sep/mt19937ar.c\n\nC_code= raw\"\"\"\n#include <stdio.h>\n#include <stdlib.h>\n#include <time.h>\n#include \"mt19937ar.sep/mt19937ar.h\"\n#include \"mt19937ar.sep/mt19937ar.c\"\n\ndouble findpi(unsigned long n){\n srand((unsigned)time(NULL));\n init_genrand(rand());\n unsigned long count = 0;\n double x,y;\n for(unsigned long j = 0; j < n; j++){\n x = genrand_real1();\n y = genrand_real1();\n if(x*x + y*y <= 1){\n count++;\n }\n }\n return ((double)4.0)*((double)count)/((double)n);\n}\n\"\"\"\n\nfilename = tempname()\nfilenamedl = filename * \".\" * Libdl.dlext\n\nopen(`gcc -Wall -O3 -march=native -xc -shared -o $filenamedl -`, \"w\") do f\n print(f, C_code)\nend\n\nrun(`ls -l $filenamedl`)", | |
"execution_count": 16, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "-rwxr-xr-x 1 genkuroki 197610 111025 Aug 21 00:50 C:\\Users\\GENKUR~1\\AppData\\Local\\Temp\\jl_20A0.tmp.dll\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "const findpi_gcc_MT_lib = filename\nfindpi_gcc_MT(n::Int64) = ccall((:findpi, findpi_gcc_MT_lib), Float64, (Int64,), n)\nfindpi_gcc_MT(10^5)", | |
"execution_count": 17, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 17, | |
"data": { | |
"text/plain": "3.13792" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "@time findpi_gcc_MT(10^8)", | |
"execution_count": 18, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": " 0.881956 seconds (6 allocations: 192 bytes)\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"execution_count": 18, | |
"data": { | |
"text/plain": "3.14160724" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Download http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/mt19937-64stdint.tgz\n#\n# Extract\n# mt19937-64/mt64.h\n# mt19937-64/mt19937-64.c\n\nC_code= raw\"\"\"\n#include <stdio.h>\n#include <stdlib.h>\n#include <time.h>\n#define HAVE_SSE2\n#include \"mt19937-64/mt64.h\"\n#include \"mt19937-64/mt19937-64.c\"\n\ndouble findpi(uint64_t n){\n srand((unsigned)time(NULL));\n uint64_t init[4]={(uint64_t)rand(), (uint64_t)rand(), (uint64_t)rand(), (uint64_t)rand()};\n uint64_t length = 4;\n init_by_array64(init, length);\n uint64_t count = 0;\n double x,y;\n for(uint64_t j = 0; j < n; j++){\n x = genrand64_real2();\n y = genrand64_real2();\n if(x*x + y*y <= 1){\n count++;\n }\n }\n return ((double)4.0)*((double)count)/((double)n);\n}\n\"\"\"\n\nfilename = tempname()\nfilenamedl = filename * \".\" * Libdl.dlext\n\nopen(`gcc -Wall -O3 -march=native -xc -shared -o $filenamedl -`, \"w\") do f\n print(f, C_code)\nend\n\nrun(`ls -l $filenamedl`)", | |
"execution_count": 19, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "-rwxr-xr-x 1 genkuroki 197610 113055 Aug 21 00:50 C:\\Users\\GENKUR~1\\AppData\\Local\\Temp\\jl_24E7.tmp.dll\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "const findpi_gcc_MT64_lib = filename\nfindpi_gcc_MT64(n::Int64) = ccall((:findpi, findpi_gcc_MT64_lib), Float64, (Int64,), n)\nfindpi_gcc_MT64(10^5)", | |
"execution_count": 20, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 20, | |
"data": { | |
"text/plain": "3.1444" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "@time findpi_gcc_MT64(10^8)", | |
"execution_count": 21, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": " 1.277221 seconds (6 allocations: 192 bytes)\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"execution_count": 21, | |
"data": { | |
"text/plain": "3.14161424" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Download http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/SFMT/dSFMT-src-2.2.3.tar.gz\n#\n# Extract\n# dSFMT-src-2.2.3/dSFMT.h\n# dSFMT-src-2.2.3/dSFMT.c\n\nC_code= raw\"\"\"\n#include <stdio.h>\n#include <stdlib.h>\n#include <time.h>\n#define HAVE_SSE2\n#define DSFMT_MEXP 521\n#include \"dSFMT-src-2.2.3/dSFMT.h\"\n#include \"dSFMT-src-2.2.3/dSFMT.c\"\n\ndouble findpi(unsigned long n){\n srand((unsigned)time(NULL));\n dsfmt_t dsfmt;\n dsfmt_init_gen_rand(&dsfmt, rand());\n unsigned long count = 0;\n double x,y;\n for(unsigned long j = 0; j < n; j++){\n x = dsfmt_genrand_close_open(&dsfmt);\n y = dsfmt_genrand_close_open(&dsfmt);\n if(x*x + y*y <= 1){\n count++;\n }\n }\n return ((double)4.0)*((double)count)/((double)n);\n}\n\"\"\"\n\nfilename = tempname()\nfilenamedl = filename * \".\" * Libdl.dlext\n\nopen(`gcc -Wall -O3 -march=native -xc -shared -o $filenamedl -`, \"w\") do f\n print(f, C_code)\nend\n\nrun(`ls -l $filenamedl`)", | |
"execution_count": 22, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "-rwxr-xr-x 1 genkuroki 197610 125683 Aug 21 00:50 C:\\Users\\GENKUR~1\\AppData\\Local\\Temp\\jl_2AD3.tmp.dll\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "const findpi_gcc_dSFMT_lib = filename\nfindpi_gcc_dSFMT(n::Int64) = ccall((:findpi, findpi_gcc_dSFMT_lib), Float64, (Int64,), n)\nfindpi_gcc_dSFMT(10^5)", | |
"execution_count": 23, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 23, | |
"data": { | |
"text/plain": "3.1324" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "@time findpi_gcc_dSFMT(10^8)", | |
"execution_count": 24, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": " 0.315492 seconds (6 allocations: 192 bytes)\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"execution_count": 24, | |
"data": { | |
"text/plain": "3.14122552" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Download http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/SFMT/dSFMT-src-2.2.3.tar.gz\n#\n# Extract\n# dSFMT-src-2.2.3/dSFMT.h\n# dSFMT-src-2.2.3/dSFMT.c\n\n# https://gist.github.com/Toru3/fba81b9106d8fa88e120ce85ab1d98f7\n# findpi_dSMFT_openmp.c\n\nC_code= raw\"\"\"\n#include <stdio.h>\n#include <time.h>\n#include <stdlib.h>\n#include <omp.h>\n#define HAVE_SSE2\n#define DSFMT_MEXP 521\n#include \"dSFMT-src-2.2.3/dSFMT.h\"\n#include \"dSFMT-src-2.2.3/dSFMT.c\"\n\ndouble findpi(unsigned long n){\n srand((unsigned)time(NULL));\n unsigned long threads = omp_get_num_procs();\n n = (n+threads-1)/threads*threads;\n const unsigned long size = n/threads;\n\n dsfmt_t dsfmt[threads];\n for(unsigned long i=0; i<threads; i++){\n dsfmt_init_gen_rand(dsfmt+i, rand());\n }\n\n unsigned long count = 0;\n#pragma omp parallel for reduction(+:count)\n for(unsigned long i=0; i<threads; i++){\n for(unsigned long j=0; j<size; j++){\n double x = dsfmt_genrand_close_open(dsfmt+i);\n double y = dsfmt_genrand_close_open(dsfmt+i);\n if(x*x+y*y<=1){\n count++;\n }\n }\n }\n return 4.0*count/n;\n}\n\"\"\"\n\nfilename = tempname()\nfilenamedl = filename * \".\" * Libdl.dlext\n\nopen(`gcc -Wall -O3 -march=native -fopenmp -xc -shared -o $filenamedl -`, \"w\") do f\n print(f, C_code)\nend\n\nrun(`ls -l $filenamedl`)", | |
"execution_count": 25, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "-rwxr-xr-x 1 genkuroki 197610 127084 Aug 21 00:50 C:\\Users\\GENKUR~1\\AppData\\Local\\Temp\\jl_2DA3.tmp.dll\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "const findpi_gcc_dSFMT_OMP_lib = filename\nfindpi_gcc_dSFMT_OMP(n::Int64) = ccall((:findpi, findpi_gcc_dSFMT_OMP_lib), Float64, (Int64,), n)\nfindpi_gcc_dSFMT_OMP(10^5)", | |
"execution_count": 26, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 26, | |
"data": { | |
"text/plain": "3.146" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "@time findpi_gcc_dSFMT_OMP(10^8)", | |
"execution_count": 27, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": " 0.167171 seconds (6 allocations: 192 bytes)\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"execution_count": 27, | |
"data": { | |
"text/plain": "3.14169804" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Download http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/SFMT/dSFMT-src-2.2.3.tar.gz\n#\n# Extract\n# dSFMT-src-2.2.3/dSFMT.h\n# dSFMT-src-2.2.3/dSFMT.c\n\n# https://gist.github.com/Toru3/fba81b9106d8fa88e120ce85ab1d98f7\n# findpi_dSMFT_openmp_sse.c\n\nC_code= raw\"\"\"\n#include <stdio.h>\n#include <time.h>\n#include <stdlib.h>\n#include <omp.h>\n#include <emmintrin.h>\n#define HAVE_SSE2\n#define DSFMT_MEXP 521\n#include \"dSFMT-src-2.2.3/dSFMT.h\"\n#include \"dSFMT-src-2.2.3/dSFMT.c\"\n\ndouble findpi(unsigned long n){\n srand((unsigned)time(NULL));\n unsigned long threads = omp_get_num_procs();\n const unsigned long N = 8;\n unsigned long width = threads * N;\n n = (n+width-1)/width*width;\n const unsigned long size = n/threads;\n\n dsfmt_t dsfmt[threads];\n for(unsigned long i=0; i<threads; i++){\n dsfmt_init_gen_rand(dsfmt+i, rand());\n }\n\n unsigned long count = 0;\n#pragma omp parallel for reduction(+:count)\n for(unsigned long i=0; i<threads; i++){\n __m128d one;\n {\n double one_d = 1;\n one = _mm_loadl_pd(one, &one_d);\n one = _mm_loadh_pd(one, &one_d);\n }\n double s[2*N];\n double t[N/2][2];\n __m128d x0, x1, x2, x3;\n __m128d y0, y1, y2, y3;\n __m128d c0, c1, c2, c3;\n for(unsigned long j=0; j<size; j+=N){\n dsfmt_fill_array_close1_open2(dsfmt+i, s, 2*N);\n x0 = _mm_loadu_pd(s+0);\n y0 = _mm_loadu_pd(s+2);\n x1 = _mm_loadu_pd(s+4);\n y1 = _mm_loadu_pd(s+6);\n x2 = _mm_loadu_pd(s+8);\n y2 = _mm_loadu_pd(s+10);\n x3 = _mm_loadu_pd(s+12);\n y3 = _mm_loadu_pd(s+14);\n\n x0 = _mm_sub_pd(x0, one);\n y0 = _mm_sub_pd(y0, one);\n x1 = _mm_sub_pd(x1, one);\n y1 = _mm_sub_pd(y1, one);\n x2 = _mm_sub_pd(x2, one);\n y2 = _mm_sub_pd(y2, one);\n x3 = _mm_sub_pd(x3, one);\n y3 = _mm_sub_pd(y3, one);\n\n x0 = _mm_mul_pd(x0, x0);\n y0 = _mm_mul_pd(y0, y0);\n x1 = _mm_mul_pd(x1, x1);\n y1 = _mm_mul_pd(y1, y1);\n x2 = _mm_mul_pd(x2, x2);\n y2 = _mm_mul_pd(y2, y2);\n x3 = _mm_mul_pd(x3, x3);\n y3 = _mm_mul_pd(y3, y3);\n\n x0 = _mm_add_pd(x0, y0);\n x1 = _mm_add_pd(x1, y1);\n x2 = _mm_add_pd(x2, y2);\n x3 = _mm_add_pd(x3, y3);\n c0 = _mm_cmple_pd(x0, one);\n c1 = _mm_cmple_pd(x1, one);\n c2 = _mm_cmple_pd(x2, one);\n c3 = _mm_cmple_pd(x3, one);\n\n _mm_storeu_pd(t[0], c0);\n _mm_storeu_pd(t[1], c1);\n _mm_storeu_pd(t[2], c2);\n _mm_storeu_pd(t[3], c3);\n\n count +=\n (t[0][0]!=0)+(t[0][1]!=0)\n +(t[1][0]!=0)+(t[1][1]!=0)\n +(t[2][0]!=0)+(t[2][1]!=0)\n +(t[3][0]!=0)+(t[3][1]!=0);\n }\n }\n return 4.0*count/n;\n}\n\"\"\"\n\nfilename = tempname()\nfilenamedl = filename * \".\" * Libdl.dlext\n\nopen(`gcc -O3 -march=native -fopenmp -xc -shared -o $filenamedl -`, \"w\") do f\n print(f, C_code)\nend\n\nrun(`ls -l $filenamedl`)", | |
"execution_count": 28, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "-rwxr-xr-x 1 genkuroki 197610 127596 Aug 21 00:50 C:\\Users\\GENKUR~1\\AppData\\Local\\Temp\\jl_2FF6.tmp.dll\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "const findpi_gcc_dSFMT_OMP_SSE_lib = filename\nfindpi_gcc_dSFMT_OMP_SSE(n::Int64) = ccall((:findpi, findpi_gcc_dSFMT_OMP_SSE_lib), Float64, (Int64,), n)\nfindpi_gcc_dSFMT_OMP_SSE(10^5)", | |
"execution_count": 29, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 29, | |
"data": { | |
"text/plain": "3.1392354446577095" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "@time findpi_gcc_dSFMT_OMP_SSE(10^8)", | |
"execution_count": 30, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": " 0.181919 seconds (6 allocations: 192 bytes)\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"execution_count": 30, | |
"data": { | |
"text/plain": "3.14132084" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Download http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/SFMT/dSFMT-src-2.2.3.tar.gz\n#\n# Extract\n# dSFMT-src-2.2.3/dSFMT.h\n# dSFMT-src-2.2.3/dSFMT.c\n\n# https://gist.github.com/Toru3/fba81b9106d8fa88e120ce85ab1d98f7\n# findpi_dSMFT_openmp_fma.c\n\nC_code= raw\"\"\"\n#include <stdio.h>\n#include <time.h>\n#include <stdlib.h>\n#include <omp.h>\n#include <immintrin.h>\n#define HAVE_SSE2\n#define DSFMT_MEXP 521\n#include \"dSFMT-src-2.2.3/dSFMT.h\"\n#include \"dSFMT-src-2.2.3/dSFMT.c\"\n\ndouble findpi(unsigned long n){\n srand((unsigned)time(NULL));\n unsigned long threads = omp_get_num_procs();\n const unsigned long N = 32;\n unsigned long width = threads * N;\n n = (n+width-1)/width*width;\n const unsigned long size = n/threads;\n\n unsigned long seed[threads];\n for(unsigned long i=0; i<threads; i++) seed[i] = rand();\n\n unsigned long count = 0;\n#pragma omp parallel for reduction(+:count)\n for(unsigned long i=0; i<threads; i++){\n dsfmt_t dsfmt;\n dsfmt_init_gen_rand(&dsfmt, seed[i]);\n __m256d one;\n {\n double one_d = 1;\n one = _mm256_broadcast_sd(&one_d);\n }\n double s[2*N];\n long long t[N/4][4];\n __m256d x0, x1, x2, x3, x4, x5, x6, x7;\n __m256d y0, y1, y2, y3, y4, y5, y6, y7;\n __m256d c0, c1, c2, c3, c4, c5, c6, c7;\n for(unsigned long j=0; j<size; j+=N){\n dsfmt_fill_array_close1_open2(&dsfmt, s, 2*N);\n\n x0 = _mm256_loadu_pd(s+ 0);\n x1 = _mm256_loadu_pd(s+ 4);\n x2 = _mm256_loadu_pd(s+ 8);\n x3 = _mm256_loadu_pd(s+12);\n x4 = _mm256_loadu_pd(s+16);\n x5 = _mm256_loadu_pd(s+20);\n x6 = _mm256_loadu_pd(s+24);\n x7 = _mm256_loadu_pd(s+28);\n\n y0 = _mm256_loadu_pd(s+32);\n y1 = _mm256_loadu_pd(s+36);\n y2 = _mm256_loadu_pd(s+40);\n y3 = _mm256_loadu_pd(s+44);\n y4 = _mm256_loadu_pd(s+48);\n y5 = _mm256_loadu_pd(s+52);\n y6 = _mm256_loadu_pd(s+56);\n y7 = _mm256_loadu_pd(s+60);\n\n x0 = _mm256_sub_pd(x0, one);\n x1 = _mm256_sub_pd(x1, one);\n x2 = _mm256_sub_pd(x2, one);\n x3 = _mm256_sub_pd(x3, one);\n x4 = _mm256_sub_pd(x4, one);\n x5 = _mm256_sub_pd(x5, one);\n x6 = _mm256_sub_pd(x6, one);\n x7 = _mm256_sub_pd(x7, one);\n\n y0 = _mm256_sub_pd(y0, one);\n y1 = _mm256_sub_pd(y1, one);\n y2 = _mm256_sub_pd(y2, one);\n y3 = _mm256_sub_pd(y3, one);\n y4 = _mm256_sub_pd(y4, one);\n y5 = _mm256_sub_pd(y5, one);\n y6 = _mm256_sub_pd(y6, one);\n y7 = _mm256_sub_pd(y7, one);\n\n x0 = _mm256_mul_pd(x0, x0);\n x1 = _mm256_mul_pd(x1, x1);\n x2 = _mm256_mul_pd(x2, x2);\n x3 = _mm256_mul_pd(x3, x3);\n x4 = _mm256_mul_pd(x4, x4);\n x5 = _mm256_mul_pd(x5, x5);\n x6 = _mm256_mul_pd(x6, x6);\n x7 = _mm256_mul_pd(x7, x7);\n\n x0 = _mm256_fmadd_pd(y0, y0, x0);\n x1 = _mm256_fmadd_pd(y1, y1, x1);\n x2 = _mm256_fmadd_pd(y2, y2, x2);\n x3 = _mm256_fmadd_pd(y3, y3, x3);\n x4 = _mm256_fmadd_pd(y4, y4, x4);\n x5 = _mm256_fmadd_pd(y5, y5, x5);\n x6 = _mm256_fmadd_pd(y6, y6, x6);\n x7 = _mm256_fmadd_pd(y7, y7, x7);\n\n c0 = _mm256_cmp_pd(x0, one, _CMP_LE_OQ);\n c1 = _mm256_cmp_pd(x1, one, _CMP_LE_OQ);\n c2 = _mm256_cmp_pd(x2, one, _CMP_LE_OQ);\n c3 = _mm256_cmp_pd(x3, one, _CMP_LE_OQ);\n c4 = _mm256_cmp_pd(x4, one, _CMP_LE_OQ);\n c5 = _mm256_cmp_pd(x5, one, _CMP_LE_OQ);\n c6 = _mm256_cmp_pd(x6, one, _CMP_LE_OQ);\n c7 = _mm256_cmp_pd(x7, one, _CMP_LE_OQ);\n\n _mm256_storeu_pd((double*)t[0], c0);\n _mm256_storeu_pd((double*)t[1], c1);\n _mm256_storeu_pd((double*)t[2], c2);\n _mm256_storeu_pd((double*)t[3], c3);\n _mm256_storeu_pd((double*)t[4], c4);\n _mm256_storeu_pd((double*)t[5], c5);\n _mm256_storeu_pd((double*)t[6], c6);\n _mm256_storeu_pd((double*)t[7], c7);\n\n unsigned int f =\n ((t[0][0]&1 | t[0][1]&2 | t[0][2]&4 | t[0][3]&8) << 0) |\n ((t[1][0]&1 | t[1][1]&2 | t[1][2]&4 | t[1][3]&8) << 4) |\n ((t[2][0]&1 | t[2][1]&2 | t[2][2]&4 | t[2][3]&8) << 8) |\n ((t[3][0]&1 | t[3][1]&2 | t[3][2]&4 | t[3][3]&8) << 12) |\n ((t[4][0]&1 | t[4][1]&2 | t[4][2]&4 | t[4][3]&8) << 16) |\n ((t[5][0]&1 | t[5][1]&2 | t[5][2]&4 | t[5][3]&8) << 20) |\n ((t[6][0]&1 | t[6][1]&2 | t[6][2]&4 | t[6][3]&8) << 24) |\n ((t[7][0]&1 | t[7][1]&2 | t[7][2]&4 | t[7][3]&8) << 28) ;\n\n f = (f&0x55555555) + ((f>> 1)&0x55555555);\n f = (f&0x33333333) + ((f>> 2)&0x33333333);\n f = (f&0x0f0f0f0f) + ((f>> 4)&0x0f0f0f0f);\n f = (f&0x00ff00ff) + ((f>> 8)&0x00ff00ff);\n f = (f&0x0000ffff) + ((f>>16)&0x0000ffff);\n count += f;\n }\n }\n return 4.0*count/n;\n}\n\"\"\"\n\nfilename = tempname()\nfilenamedl = filename * \".\" * Libdl.dlext\n\nopen(`gcc -O3 -march=native -fopenmp -xc -shared -o $filenamedl -`, \"w\") do f\n print(f, C_code)\nend\n\nrun(`ls -l $filenamedl`)", | |
"execution_count": 31, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "-rwxr-xr-x 1 genkuroki 197610 128620 Aug 21 00:51 C:\\Users\\GENKUR~1\\AppData\\Local\\Temp\\jl_3258.tmp.dll\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "const findpi_gcc_dSFMT_OMP_FMA_lib = filename\nfindpi_gcc_dSFMT_OMP_FMA(n::Int64) = ccall((:findpi, findpi_gcc_dSFMT_OMP_FMA_lib), Float64, (Int64,), n)\nfindpi_gcc_dSFMT_OMP_FMA(10^5)", | |
"execution_count": 32, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 32, | |
"data": { | |
"text/plain": "3.141983695652174" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "@time findpi_gcc_dSFMT_OMP_FMA(10^8)", | |
"execution_count": 33, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": " 0.073832 seconds (6 allocations: 192 bytes)\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"execution_count": 33, | |
"data": { | |
"text/plain": "3.14154444" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "@time findpi_julia(10^9)\n@time findpi_gcc_rand(10^9)\n@time findpi_gcc_rand_int(10^9)\n@time findpi_gcc_MT(10^9)\n@time findpi_gcc_dSFMT(10^9)", | |
"execution_count": 34, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": " 4.011467 seconds (6 allocations: 192 bytes)\n 25.206722 seconds (6 allocations: 192 bytes)\n 18.465632 seconds (6 allocations: 192 bytes)\n 8.741574 seconds (6 allocations: 192 bytes)\n 3.039745 seconds (6 allocations: 192 bytes)\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"execution_count": 34, | |
"data": { | |
"text/plain": "3.141516528" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "@btime findpi_julia(10^9)\n@btime findpi_gcc_MT64(10^9)\n@btime findpi_gcc_MT(10^9)\n@btime findpi_gcc_dSFMT(10^9)", | |
"execution_count": 35, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": " 4.002 s (0 allocations: 0 bytes)\n 13.203 s (0 allocations: 0 bytes)\n 8.826 s (0 allocations: 0 bytes)\n 2.850 s (0 allocations: 0 bytes)\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"execution_count": 35, | |
"data": { | |
"text/plain": "3.141591928" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "@btime findpi_julia_parallel(10^9)\n@btime findpi_gcc_dSFMT_OMP(10^9)\n@btime findpi_gcc_dSFMT_OMP_SSE(10^9)\n@btime findpi_gcc_dSFMT_OMP_FMA(10^9)", | |
"execution_count": 36, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": " 1.248 s (1162 allocations: 99.25 KiB)\n 1.445 s (0 allocations: 0 bytes)\n 1.723 s (0 allocations: 0 bytes)\n 546.748 ms (0 allocations: 0 bytes)\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"execution_count": 36, | |
"data": { | |
"text/plain": "3.141593752" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"collapsed": true, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "", | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"_draft": { | |
"nbviewer_url": "https://gist.github.com/c9ae4e03fb379235ac6b26f5d62e4da5" | |
}, | |
"gist": { | |
"id": "c9ae4e03fb379235ac6b26f5d62e4da5", | |
"data": { | |
"description": "C/Julia vs. gcc/Benchmark test of Monte Carlo calculation of pi.ipynb", | |
"public": true | |
} | |
}, | |
"kernelspec": { | |
"name": "julia-0.6-pauto", | |
"display_name": "Julia 0.6.0 procs auto", | |
"language": "julia" | |
}, | |
"language_info": { | |
"file_extension": ".jl", | |
"name": "julia", | |
"mimetype": "application/julia", | |
"version": "0.6.0" | |
}, | |
"toc": { | |
"threshold": 4, | |
"number_sections": true, | |
"toc_cell": false, | |
"toc_window_display": false, | |
"toc_section_display": "block", | |
"sideBar": true, | |
"navigate_menu": true, | |
"moveMenuLeft": true, | |
"widenNotebook": false, | |
"colors": { | |
"hover_highlight": "#DAA520", | |
"selected_highlight": "#FFD700", | |
"running_highlight": "#FF0000", | |
"wrapper_background": "#FFFFFF", | |
"sidebar_border": "#EEEEEE", | |
"navigate_text": "#333333", | |
"navigate_num": "#000000" | |
}, | |
"nav_menu": { | |
"height": "12px", | |
"width": "252px" | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment