Created
December 8, 2014 17:47
-
-
Save seibert/50b725012fbda101013b to your computer and use it in GitHub Desktop.
Simple Numba Benchmarks for comparing AVX and SSE
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:2e116b63cf60314c65252af07ea0c70a2ca09c9871efb5e7ef80edf1ab32b037" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import os\n", | |
"os.environ['NUMBA_ENABLE_AVX'] = '1'\n", | |
"import numpy as np\n", | |
"import numba\n", | |
"import math" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"a = np.arange(10000).astype(np.float32)\n", | |
"b = a.copy()\n", | |
"out = np.empty_like(a)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"@numba.jit\n", | |
"def do_basic(a, b, out):\n", | |
" for i in range(a.shape[0]):\n", | |
" out[i] = a[i] + b[i] * math.fabs(a[i])\n", | |
"\n", | |
"@numba.jit\n", | |
"def do_numpy_cos(a, out):\n", | |
" for i in range(a.shape[0]):\n", | |
" out[i] = np.cos(a[i])\n", | |
"\n", | |
"@numba.jit\n", | |
"def do_math_cos(a, out):\n", | |
" for i in range(a.shape[0]):\n", | |
" out[i] = math.cos(a[i])" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Dump the assembly for the first function only, to check whether AVX\n", | |
"# is enabled (vaddps, vmulps, etc.) or disabled (addps, mulps, etc.).\n", | |
"numba.config.DUMP_ASSEMBLY = 1\n", | |
"do_basic(a, b, out)\n", | |
"numba.config.DUMP_ASSEMBLY = 0\n", | |
"# Call the remaining functions once to trigger their JIT compilation\n", | |
"do_numpy_cos(a, out)\n", | |
"do_math_cos(a, out)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"-------------------------------ASSEMBLY do_basic--------------------------------\n", | |
"\t.section\t__TEXT,__text,regular,pure_instructions\n", | |
"\t.macosx_version_min 14, 0\n", | |
"\t.section\t__TEXT,__const\n", | |
"\t.align\t5\n", | |
"LCPI0_0:\n", | |
"\t.long\t2147483647\n", | |
"\t.long\t2147483647\n", | |
"\t.long\t2147483647\n", | |
"\t.long\t2147483647\n", | |
"\t.long\t2147483647\n", | |
"\t.long\t2147483647\n", | |
"\t.long\t2147483647\n", | |
"\t.long\t2147483647\n", | |
"\t.section\t__TEXT,__literal16,16byte_literals\n", | |
"\t.align\t4\n", | |
"LCPI0_1:\n", | |
"\t.long\t2147483647\n", | |
"\t.long\t2147483647\n", | |
"\t.long\t2147483647\n", | |
"\t.long\t2147483647\n", | |
"\t.section\t__TEXT,__text,regular,pure_instructions\n", | |
"\t.globl\t\"___main__.do_basic.array(float32,_1d,_C,_nonconst).array(float32,_1d,_C,_nonconst).array(float32,_1d,_C,_nonconst)\"\n", | |
"\t.align\t4, 0x90\n", | |
"\"___main__.do_basic.array(float32,_1d,_C,_nonconst).array(float32,_1d,_C,_nonconst).array(float32,_1d,_C,_nonconst)\":\n", | |
"\tpushq\t%rbp\n", | |
"\tpushq\t%r15\n", | |
"\tpushq\t%r14\n", | |
"\tpushq\t%rbx\n", | |
"\tmovq\t32(%rdx), %rax\n", | |
"\ttestq\t%rax, %rax\n", | |
"\tjle\tLBB0_19\n", | |
"\tmovq\t24(%rdx), %r9\n", | |
"\tmovq\t24(%rcx), %r10\n", | |
"\tmovq\t24(%r8), %r8\n", | |
"\txorl\t%esi, %esi\n", | |
"\ttestq\t%rax, %rax\n", | |
"\tje\tLBB0_13\n", | |
"\txorl\t%esi, %esi\n", | |
"\tmovq\t%rax, %r11\n", | |
"\tmovq\t%rax, %r14\n", | |
"\tandq\t$-8, %r11\n", | |
"\tje\tLBB0_12\n", | |
"\tleaq\t-1(%rax), %rcx\n", | |
"\tleaq\t-4(%r8,%rax,4), %rdx\n", | |
"\tleaq\t(%r9,%rcx,4), %rsi\n", | |
"\tcmpq\t%rsi, %r8\n", | |
"\tsetbe\t%bpl\n", | |
"\tcmpq\t%rdx, %r9\n", | |
"\tsetbe\t%bl\n", | |
"\tleaq\t(%r10,%rcx,4), %rcx\n", | |
"\tcmpq\t%rcx, %r8\n", | |
"\tsetbe\t%cl\n", | |
"\tcmpq\t%rdx, %r10\n", | |
"\tsetbe\t%dl\n", | |
"\txorl\t%esi, %esi\n", | |
"\tmovq\t%rax, %r14\n", | |
"\ttestb\t%bl, %bpl\n", | |
"\tjne\tLBB0_12\n", | |
"\tandb\t%dl, %cl\n", | |
"\tjne\tLBB0_12\n", | |
"\tmovq\t%rax, %r14\n", | |
"\tsubq\t%r11, %r14\n", | |
"\tmovq\t%rax, %rsi\n", | |
"\tandq\t$-8, %rsi\n", | |
"\tleaq\t-8(%rsi), %rcx\n", | |
"\tshrq\t$3, %rcx\n", | |
"\txorl\t%r15d, %r15d\n", | |
"\tincq\t%rcx\n", | |
"\tje\tLBB0_7\n", | |
"\tmovq\t%rcx, %rdx\n", | |
"\tandq\t$1, %rdx\n", | |
"\tje\tLBB0_8\n", | |
"LBB0_7:\n", | |
"\tvmovups\t(%r9), %xmm0\n", | |
"\tvinsertf128\t$1, 16(%r9), %ymm0, %ymm0\n", | |
"\tvmovups\t(%r10), %xmm1\n", | |
"\tvinsertf128\t$1, 16(%r10), %ymm1, %ymm1\n", | |
"\tmovabsq\t$LCPI0_0, %rdx\n", | |
"\tvandps\t(%rdx), %ymm0, %ymm2\n", | |
"\tvmulps\t%ymm2, %ymm1, %ymm1\n", | |
"\tvaddps\t%ymm1, %ymm0, %ymm0\n", | |
"\tvextractf128\t$1, %ymm0, 16(%r8)\n", | |
"\tvmovups\t%xmm0, (%r8)\n", | |
"\tmovl\t$8, %r15d\n", | |
"LBB0_8:\n", | |
"\tcmpq\t$2, %rcx\n", | |
"\tjb\tLBB0_11\n", | |
"\tleaq\t32(%r9,%r15,4), %rdx\n", | |
"\tleaq\t32(%r10,%r15,4), %rbx\n", | |
"\tleaq\t32(%r8,%r15,4), %rcx\n", | |
"\tsubq\t%r15, %rsi\n", | |
"\tmovabsq\t$LCPI0_0, %rbp\n", | |
"\tvmovaps\t(%rbp), %ymm0\n", | |
"\t.align\t4, 0x90\n", | |
"LBB0_10:\n", | |
"\tvmovups\t-32(%rdx), %xmm1\n", | |
"\tvinsertf128\t$1, -16(%rdx), %ymm1, %ymm1\n", | |
"\tvmovups\t-32(%rbx), %xmm2\n", | |
"\tvinsertf128\t$1, -16(%rbx), %ymm2, %ymm2\n", | |
"\tvandps\t%ymm0, %ymm1, %ymm3\n", | |
"\tvmulps\t%ymm3, %ymm2, %ymm2\n", | |
"\tvaddps\t%ymm2, %ymm1, %ymm1\n", | |
"\tvextractf128\t$1, %ymm1, -16(%rcx)\n", | |
"\tvmovups\t%xmm1, -32(%rcx)\n", | |
"\tvmovups\t(%rdx), %xmm1\n", | |
"\tvinsertf128\t$1, 16(%rdx), %ymm1, %ymm1\n", | |
"\tvmovups\t(%rbx), %xmm2\n", | |
"\tvinsertf128\t$1, 16(%rbx), %ymm2, %ymm2\n", | |
"\tvandps\t%ymm0, %ymm1, %ymm3\n", | |
"\tvmulps\t%ymm3, %ymm2, %ymm2\n", | |
"\tvaddps\t%ymm2, %ymm1, %ymm1\n", | |
"\tvextractf128\t$1, %ymm1, 16(%rcx)\n", | |
"\tvmovups\t%xmm1, (%rcx)\n", | |
"\taddq\t$64, %rdx\n", | |
"\taddq\t$64, %rbx\n", | |
"\taddq\t$64, %rcx\n", | |
"\taddq\t$-16, %rsi\n", | |
"\tjne\tLBB0_10\n", | |
"LBB0_11:\n", | |
"\tmovq\t%r11, %rsi\n", | |
"LBB0_12:\n", | |
"\tcmpq\t%rsi, %rax\n", | |
"\tmovq\t%r14, %rax\n", | |
"\tje\tLBB0_19\n", | |
"LBB0_13:\n", | |
"\tmovq\t%rax, %rcx\n", | |
"\tnegq\t%rcx\n", | |
"\tcmpq\t$-2, %rcx\n", | |
"\tmovq\t$-1, %rdx\n", | |
"\tcmovgq\t%rcx, %rdx\n", | |
"\tleaq\t1(%rax,%rdx), %rcx\n", | |
"\ttestq\t%rcx, %rcx\n", | |
"\tje\tLBB0_15\n", | |
"\tmovq\t%rcx, %rdx\n", | |
"\tandq\t$1, %rdx\n", | |
"\tje\tLBB0_16\n", | |
"LBB0_15:\n", | |
"\tdecq\t%rax\n", | |
"\tvmovss\t(%r9,%rsi,4), %xmm0\n", | |
"\tmovabsq\t$LCPI0_1, %rdx\n", | |
"\tvandps\t(%rdx), %xmm0, %xmm1\n", | |
"\tvmulss\t(%r10,%rsi,4), %xmm1, %xmm1\n", | |
"\tvaddss\t%xmm1, %xmm0, %xmm0\n", | |
"\tvmovss\t%xmm0, (%r8,%rsi,4)\n", | |
"\tleaq\t1(%rsi), %rsi\n", | |
"LBB0_16:\n", | |
"\tcmpq\t$2, %rcx\n", | |
"\tjb\tLBB0_19\n", | |
"\tleaq\t4(%r8,%rsi,4), %rcx\n", | |
"\tleaq\t4(%r10,%rsi,4), %rdx\n", | |
"\tleaq\t4(%r9,%rsi,4), %rsi\n", | |
"\tmovabsq\t$LCPI0_1, %rbp\n", | |
"\tvmovss\t(%rbp), %xmm0\n", | |
"\t.align\t4, 0x90\n", | |
"LBB0_18:\n", | |
"\tvmovss\t-4(%rsi), %xmm1\n", | |
"\tvandps\t%xmm0, %xmm1, %xmm2\n", | |
"\tvmulss\t-4(%rdx), %xmm2, %xmm2\n", | |
"\tvaddss\t%xmm2, %xmm1, %xmm1\n", | |
"\tvmovss\t%xmm1, -4(%rcx)\n", | |
"\taddq\t$-2, %rax\n", | |
"\tvmovss\t(%rsi), %xmm1\n", | |
"\tvandps\t%xmm0, %xmm1, %xmm2\n", | |
"\tvmulss\t(%rdx), %xmm2, %xmm2\n", | |
"\tvaddss\t%xmm2, %xmm1, %xmm1\n", | |
"\tvmovss\t%xmm1, (%rcx)\n", | |
"\taddq\t$8, %rcx\n", | |
"\taddq\t$8, %rdx\n", | |
"\taddq\t$8, %rsi\n", | |
"\ttestq\t%rax, %rax\n", | |
"\tjg\tLBB0_18\n", | |
"LBB0_19:\n", | |
"\tmovq\t$0, (%rdi)\n", | |
"\txorl\t%eax, %eax\n", | |
"\tpopq\t%rbx\n", | |
"\tpopq\t%r14\n", | |
"\tpopq\t%r15\n", | |
"\tpopq\t%rbp\n", | |
"\tvzeroupper\n", | |
"\tretq\n", | |
"\n", | |
"\t.section\t__TEXT,__const\n", | |
"\t.align\t5\n", | |
"LCPI1_0:\n", | |
"\t.long\t2147483647\n", | |
"\t.long\t2147483647\n", | |
"\t.long\t2147483647\n", | |
"\t.long\t2147483647\n", | |
"\t.long\t2147483647\n", | |
"\t.long\t2147483647\n", | |
"\t.long\t2147483647\n", | |
"\t.long\t2147483647\n", | |
"\t.section\t__TEXT,__literal16,16byte_literals\n", | |
"\t.align\t4\n", | |
"LCPI1_1:\n", | |
"\t.long\t2147483647\n", | |
"\t.long\t2147483647\n", | |
"\t.long\t2147483647\n", | |
"\t.long\t2147483647\n", | |
"\t.section\t__TEXT,__text,regular,pure_instructions\n", | |
"\t.globl\t\"_wrapper.__main__.do_basic.array(float32,_1d,_C,_nonconst).array(float32,_1d,_C,_nonconst).array(float32,_1d,_C,_nonconst)\"\n", | |
"\t.align\t4, 0x90\n", | |
"\"_wrapper.__main__.do_basic.array(float32,_1d,_C,_nonconst).array(float32,_1d,_C,_nonconst).array(float32,_1d,_C,_nonconst)\":\n", | |
"\t.cfi_startproc\n", | |
"\tpushq\t%rbp\n", | |
"Ltmp0:\n", | |
"\t.cfi_def_cfa_offset 16\n", | |
"\tpushq\t%r15\n", | |
"Ltmp1:\n", | |
"\t.cfi_def_cfa_offset 24\n", | |
"\tpushq\t%r14\n", | |
"Ltmp2:\n", | |
"\t.cfi_def_cfa_offset 32\n", | |
"\tpushq\t%r13\n", | |
"Ltmp3:\n", | |
"\t.cfi_def_cfa_offset 40\n", | |
"\tpushq\t%r12\n", | |
"Ltmp4:\n", | |
"\t.cfi_def_cfa_offset 48\n", | |
"\tpushq\t%rbx\n", | |
"Ltmp5:\n", | |
"\t.cfi_def_cfa_offset 56\n", | |
"\tsubq\t$184, %rsp\n", | |
"Ltmp6:\n", | |
"\t.cfi_def_cfa_offset 240\n", | |
"Ltmp7:\n", | |
"\t.cfi_offset %rbx, -56\n", | |
"Ltmp8:\n", | |
"\t.cfi_offset %r12, -48\n", | |
"Ltmp9:\n", | |
"\t.cfi_offset %r13, -40\n", | |
"Ltmp10:\n", | |
"\t.cfi_offset %r14, -32\n", | |
"Ltmp11:\n", | |
"\t.cfi_offset %r15, -24\n", | |
"Ltmp12:\n", | |
"\t.cfi_offset %rbp, -16\n", | |
"\tmovq\t%rdx, %r10\n", | |
"\tleaq\t160(%rsp), %rax\n", | |
"\tmovq\t%rax, (%rsp)\n", | |
"\tmovabsq\t$_.const.OOO, %rdx\n", | |
"\tmovabsq\t$_.kwlist, %rcx\n", | |
"\tmovabsq\t$_PyArg_ParseTupleAndKeywords, %rbp\n", | |
"\tleaq\t176(%rsp), %r8\n", | |
"\tleaq\t168(%rsp), %r9\n", | |
"\txorl\t%eax, %eax\n", | |
"\tmovq\t%rsi, %rdi\n", | |
"\tmovq\t%r10, %rsi\n", | |
"\tcallq\t*%rbp\n", | |
"\ttestl\t%eax, %eax\n", | |
"\tje\tLBB1_1\n", | |
"\tmovq\t176(%rsp), %rdi\n", | |
"\tmovabsq\t$_numba_adapt_ndarray, %rbx\n", | |
"\tleaq\t112(%rsp), %rsi\n", | |
"\tcallq\t*%rbx\n", | |
"\tmovq\t136(%rsp), %r14\n", | |
"\tmovq\t144(%rsp), %r13\n", | |
"\tmovabsq\t$_PyErr_Occurred, %rbp\n", | |
"\tcallq\t*%rbp\n", | |
"\ttestq\t%rax, %rax\n", | |
"\tjne\tLBB1_1\n", | |
"\tmovq\t168(%rsp), %rdi\n", | |
"\tleaq\t64(%rsp), %rsi\n", | |
"\tcallq\t*%rbx\n", | |
"\tmovq\t88(%rsp), %r15\n", | |
"\tcallq\t*%rbp\n", | |
"\ttestq\t%rax, %rax\n", | |
"\tjne\tLBB1_1\n", | |
"\tmovq\t160(%rsp), %rdi\n", | |
"\tleaq\t16(%rsp), %rsi\n", | |
"\tcallq\t*%rbx\n", | |
"\tmovq\t40(%rsp), %r12\n", | |
"\tcallq\t*%rbp\n", | |
"\ttestq\t%rax, %rax\n", | |
"\tje\tLBB1_6\n", | |
"LBB1_1:\n", | |
"\txorl\t%eax, %eax\n", | |
"LBB1_2:\n", | |
"\taddq\t$184, %rsp\n", | |
"\tpopq\t%rbx\n", | |
"\tpopq\t%r12\n", | |
"\tpopq\t%r13\n", | |
"\tpopq\t%r14\n", | |
"\tpopq\t%r15\n", | |
"\tpopq\t%rbp\n", | |
"\tretq\n", | |
"LBB1_6:\n", | |
"\ttestq\t%r13, %r13\n", | |
"\tjle\tLBB1_21\n", | |
"\txorl\t%ecx, %ecx\n", | |
"\ttestq\t%r13, %r13\n", | |
"\tje\tLBB1_15\n", | |
"\txorl\t%ecx, %ecx\n", | |
"\tmovq\t%r13, %rbp\n", | |
"\tandq\t$-16, %rbp\n", | |
"\tje\tLBB1_14\n", | |
"\tleaq\t-1(%r13), %rcx\n", | |
"\tleaq\t-4(%r12,%r13,4), %rsi\n", | |
"\tleaq\t(%r14,%rcx,4), %rdx\n", | |
"\tcmpq\t%rdx, %r12\n", | |
"\tsetbe\t%al\n", | |
"\tcmpq\t%rsi, %r14\n", | |
"\tsetbe\t%bl\n", | |
"\tleaq\t(%r15,%rcx,4), %rcx\n", | |
"\tcmpq\t%rcx, %r12\n", | |
"\tsetbe\t%dl\n", | |
"\tcmpq\t%rsi, %r15\n", | |
"\tsetbe\t%sil\n", | |
"\txorl\t%ecx, %ecx\n", | |
"\ttestb\t%bl, %al\n", | |
"\tjne\tLBB1_14\n", | |
"\tandb\t%sil, %dl\n", | |
"\tjne\tLBB1_14\n", | |
"\tleaq\t32(%r12), %rcx\n", | |
"\tleaq\t32(%r15), %rdx\n", | |
"\tleaq\t32(%r14), %rsi\n", | |
"\tmovq\t%r13, %rdi\n", | |
"\tandq\t$-16, %rdi\n", | |
"\tmovabsq\t$LCPI1_0, %rax\n", | |
"\tvmovaps\t(%rax), %ymm0\n", | |
"LBB1_12:\n", | |
"\tvmovups\t-32(%rsi), %xmm1\n", | |
"\tvmovups\t(%rsi), %xmm2\n", | |
"\tvinsertf128\t$1, -16(%rsi), %ymm1, %ymm1\n", | |
"\tvinsertf128\t$1, 16(%rsi), %ymm2, %ymm2\n", | |
"\tvmovups\t-32(%rdx), %xmm3\n", | |
"\tvmovups\t(%rdx), %xmm4\n", | |
"\tvinsertf128\t$1, -16(%rdx), %ymm3, %ymm3\n", | |
"\tvinsertf128\t$1, 16(%rdx), %ymm4, %ymm4\n", | |
"\tvandps\t%ymm0, %ymm1, %ymm5\n", | |
"\tvandps\t%ymm0, %ymm2, %ymm6\n", | |
"\tvmulps\t%ymm5, %ymm3, %ymm3\n", | |
"\tvmulps\t%ymm6, %ymm4, %ymm4\n", | |
"\tvaddps\t%ymm3, %ymm1, %ymm1\n", | |
"\tvaddps\t%ymm4, %ymm2, %ymm2\n", | |
"\tvextractf128\t$1, %ymm1, -16(%rcx)\n", | |
"\tvmovups\t%xmm1, -32(%rcx)\n", | |
"\tvextractf128\t$1, %ymm2, 16(%rcx)\n", | |
"\tvmovups\t%xmm2, (%rcx)\n", | |
"\taddq\t$64, %rcx\n", | |
"\taddq\t$64, %rdx\n", | |
"\taddq\t$64, %rsi\n", | |
"\taddq\t$-16, %rdi\n", | |
"\tjne\tLBB1_12\n", | |
"\tmovq\t%rbp, %rcx\n", | |
"LBB1_14:\n", | |
"\tcmpq\t%rcx, %r13\n", | |
"\tje\tLBB1_21\n", | |
"LBB1_15:\n", | |
"\tmovq\t%r13, %rdx\n", | |
"\tsubq\t%rcx, %rdx\n", | |
"\tmovq\t%r13, %rax\n", | |
"\tsubq\t%rcx, %rax\n", | |
"\tje\tLBB1_17\n", | |
"\tandq\t$1, %rdx\n", | |
"\tje\tLBB1_18\n", | |
"LBB1_17:\n", | |
"\tvmovss\t(%r14,%rcx,4), %xmm0\n", | |
"\tmovabsq\t$LCPI1_1, %rdx\n", | |
"\tvandps\t(%rdx), %xmm0, %xmm1\n", | |
"\tvmulss\t(%r15,%rcx,4), %xmm1, %xmm1\n", | |
"\tvaddss\t%xmm1, %xmm0, %xmm0\n", | |
"\tvmovss\t%xmm0, (%r12,%rcx,4)\n", | |
"\tleaq\t1(%rcx), %rcx\n", | |
"LBB1_18:\n", | |
"\tcmpq\t$2, %rax\n", | |
"\tjb\tLBB1_21\n", | |
"\tleaq\t4(%r12,%rcx,4), %rax\n", | |
"\tleaq\t4(%r15,%rcx,4), %rdx\n", | |
"\tleaq\t4(%r14,%rcx,4), %rsi\n", | |
"\tsubq\t%rcx, %r13\n", | |
"\tmovabsq\t$LCPI1_1, %rcx\n", | |
"\tvmovss\t(%rcx), %xmm0\n", | |
"\t.align\t4, 0x90\n", | |
"LBB1_20:\n", | |
"\tvmovss\t-4(%rsi), %xmm1\n", | |
"\tvandps\t%xmm0, %xmm1, %xmm2\n", | |
"\tvmulss\t-4(%rdx), %xmm2, %xmm2\n", | |
"\tvaddss\t%xmm2, %xmm1, %xmm1\n", | |
"\tvmovss\t%xmm1, -4(%rax)\n", | |
"\tvmovss\t(%rsi), %xmm1\n", | |
"\tvandps\t%xmm0, %xmm1, %xmm2\n", | |
"\tvmulss\t(%rdx), %xmm2, %xmm2\n", | |
"\tvaddss\t%xmm2, %xmm1, %xmm1\n", | |
"\tvmovss\t%xmm1, (%rax)\n", | |
"\taddq\t$8, %rax\n", | |
"\taddq\t$8, %rdx\n", | |
"\taddq\t$8, %rsi\n", | |
"\taddq\t$-2, %r13\n", | |
"\tjne\tLBB1_20\n", | |
"LBB1_21:\n", | |
"\tmovabsq\t$_Py_None, %rax\n", | |
"\tmovq\t(%rax), %rbx\n", | |
"\tmovabsq\t$_Py_IncRef, %rax\n", | |
"\tmovq\t%rbx, %rdi\n", | |
"\tvzeroupper\n", | |
"\tcallq\t*%rax\n", | |
"\tmovq\t%rbx, %rax\n", | |
"\tjmp\tLBB1_2\n", | |
"\t.cfi_endproc\n", | |
"\n", | |
"\t.section\t__TEXT,__const\n", | |
"_.const.a:\n", | |
"\t.asciz\t\"a\"\n", | |
"\n", | |
"_.const.b:\n", | |
"\t.asciz\t\"b\"\n", | |
"\n", | |
"_.const.out:\n", | |
"\t.asciz\t\"out\"\n", | |
"\n", | |
"\t.section\t__DATA,__const\n", | |
"\t.align\t4\n", | |
"_.kwlist:\n", | |
"\t.quad\t_.const.a\n", | |
"\t.quad\t_.const.b\n", | |
"\t.quad\t_.const.out\n", | |
"\t.quad\t0\n", | |
"\n", | |
"\t.section\t__TEXT,__const\n", | |
"_.const.OOO:\n", | |
"\t.asciz\t\"OOO\"\n", | |
"\n", | |
"\n", | |
".subsections_via_symbols\n", | |
"\n", | |
"================================================================================\n" | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print('Basic:')\n", | |
"%timeit a + b * np.fabs(a)\n", | |
"%timeit do_basic(a, b, out)\n", | |
"print('cos():')\n", | |
"%timeit np.cos(a, out=out)\n", | |
"%timeit do_numpy_cos(a, out)\n", | |
"%timeit do_math_cos(a, out)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Basic:\n", | |
"10000 loops, best of 3: 31.4 \u00b5s per loop" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"100000 loops, best of 3: 2.8 \u00b5s per loop" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"cos():\n", | |
"10000 loops, best of 3: 95.8 \u00b5s per loop" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"10000 loops, best of 3: 87.6 \u00b5s per loop" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"10000 loops, best of 3: 78.1 \u00b5s per loop" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Higher-order polynomials" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def gen_poly(order=1):\n", | |
" @numba.jit\n", | |
" def do_math(a, b, out):\n", | |
" for i in range(a.shape[0]):\n", | |
" out[i] = 1.0\n", | |
" for j in range(order):\n", | |
" out[i] += a[i] + (out[i] * b[i])\n", | |
" return do_math" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"poly_1 = gen_poly(1)\n", | |
"poly_5 = gen_poly(5)\n", | |
"poly_10 = gen_poly(10)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"poly_1(a, b, out)\n", | |
"poly_5(a, b, out)\n", | |
"poly_10(a, b, out)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%timeit poly_1(a, b, out)\n", | |
"%timeit poly_5(a, b, out)\n", | |
"%timeit poly_10(a, b, out)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"100000 loops, best of 3: 2.78 \u00b5s per loop\n", | |
"100000 loops, best of 3: 9.32 \u00b5s per loop" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"10000 loops, best of 3: 22.1 \u00b5s per loop" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"prompt_number": 9 | |
}, | |
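{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print('Time per element per polynomial iteration:')\n", | |
"# Runtimes in microseconds per call, copied by hand from the %timeit output\n", | |
"# of the previous cell and divided by the polynomial order.\n", | |
"# NOTE(review): each call processes the whole 10000-element array, so these\n", | |
"# are per-call times, not per-element times -- confirm the label above.\n", | |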
"[2.78, 9.32, 22.1] / np.array([1.0, 5.0, 10.0])" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Time per element per polynomial iteration:\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 11, | |
"text": [ | |
"array([ 2.78 , 1.864, 2.21 ])" | |
] | |
} | |
], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 10 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment