Created
December 20, 2016 21:29
-
-
Save maedoc/f31994da1fcc862d42d3a5649a040965 to your computer and use it in GitHub Desktop.
Testing auto-vectorization in Numba
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ---------------------------------------------------------------------------
# Native body of the jitted Python function f(x, y), specialised (per the
# symbol name) for two C-contiguous 1-d float32 arrays.  GAS / AT&T syntax:
# "op src, dst", and for 3-operand AVX forms "op src2, src1, dst".
#
# NOTE(review): the register usage (rsi, rdi, xmm6, xmm7 saved as callee-saved;
# first argument of outgoing calls placed in rcx) looks like the Microsoft x64
# calling convention -- confirm against the build target.
#
# Inputs as used by the code below:
#   rcx          address the float32 result is stored to on success (kept in r14)
#   rdx          address that receives a pointer to .const.picklebuf on error
#   r9           pointer to a counter, atomically incremented on entry and
#                decremented on the success exit; may be null (kept in rdi)
#   192(%rsp)    second such counter pointer; may be null (kept in rcx)
#   168(%rsp)    pointer to float32 elements read as x[i]  (kept in rsi)
#   224(%rsp)    pointer to float32 elements read as y[i]  (kept in rax)
#   (stack offsets are relative to rsp after the prologue)
# Return: eax = 0 on success, eax = 1 when a zero divisor was detected.
#
# Every floating-point instruction in the loop is a scalar "ss" form; no
# packed (vector) arithmetic appears in this function.
# ---------------------------------------------------------------------------
.text | |
.file "f" | |
.globl "__main__.f$3.array(float32,_1d,_C).array(float32,_1d,_C)" | |
.align 16, 0x90 | |
.type "__main__.f$3.array(float32,_1d,_C).array(float32,_1d,_C)",@function | |
"__main__.f$3.array(float32,_1d,_C).array(float32,_1d,_C)": | |
.cfi_startproc | |
# Prologue: save the integer registers this function overwrites.
pushq %r14 | |
.Ltmp0: | |
.cfi_def_cfa_offset 16 | |
pushq %rsi | |
.Ltmp1: | |
.cfi_def_cfa_offset 24 | |
pushq %rdi | |
.Ltmp2: | |
.cfi_def_cfa_offset 32 | |
pushq %rbx | |
.Ltmp3: | |
.cfi_def_cfa_offset 40 | |
# 72 bytes of frame; xmm6/xmm7 are spilled into it because the loop uses them.
# 4 pushes (32) + 72 + return address (8) = CFA offset 112.
subq $72, %rsp | |
vmovaps %xmm7, 48(%rsp) | |
vmovaps %xmm6, 32(%rsp) | |
.Ltmp4: | |
.cfi_def_cfa_offset 112 | |
.Ltmp5: | |
.cfi_offset %rbx, -40 | |
.Ltmp6: | |
.cfi_offset %rdi, -32 | |
.Ltmp7: | |
.cfi_offset %rsi, -24 | |
.Ltmp8: | |
.cfi_offset %r14, -16 | |
.Ltmp9: | |
.cfi_offset %xmm6, -80 | |
.Ltmp10: | |
.cfi_offset %xmm7, -64 | |
# rdi = first counter pointer, r14 = result address, rcx = second counter pointer.
movq %r9, %rdi | |
movq %rcx, %r14 | |
movq 192(%rsp), %rcx | |
# Atomically increment the first counter unless its pointer is null.
testq %rdi, %rdi | |
je .LBB0_2 | |
lock addq $1, (%rdi) | |
.LBB0_2: | |
# rax = y element pointer, rsi = x element pointer, xmm0 = 0.0 (zero-test constant).
movq 224(%rsp), %rax | |
movq 168(%rsp), %rsi | |
vxorps %xmm0, %xmm0, %xmm0 | |
# ebx = 9: the loop decrements first and continues while ebx > 1, i.e. 8 iterations.
movl $9, %ebx | |
# Atomically increment the second counter unless its pointer is null.
testq %rcx, %rcx | |
je .LBB0_4 | |
lock addq $1, (%rcx) | |
.LBB0_4: | |
# xmm6 = acc = 0.0
vxorps %xmm6, %xmm6, %xmm6 | |
.align 16, 0x90 | |
# ---- loop body: one element of x and y per iteration -----------------------
.LBB0_5: | |
# xmm2 = y[i]; if y[i] compares ordered-equal to 0.0 take the error exit.
# (jne = not equal, jp = unordered/NaN: both continue with the computation.)
vmovss (%rax), %xmm2 | |
vucomiss %xmm0, %xmm2 | |
jne .LBB0_15 | |
jp .LBB0_15 | |
jmp .LBB0_6 | |
.LBB0_15: | |
# xmm4 = x[i]; xmm1 = b = (x[i] - y[i]) / y[i]
vmovss (%rsi), %xmm4 | |
vsubss %xmm2, %xmm4, %xmm1 | |
vdivss %xmm2, %xmm1, %xmm1 | |
# b is a later divisor (a / b): error exit if b is ordered-equal to 0.0.
vxorps %xmm3, %xmm3, %xmm3 | |
vucomiss %xmm3, %xmm1 | |
jne .LBB0_16 | |
jnp .LBB0_6 | |
.LBB0_16: | |
# xmm5 = a = (x + y) * x
vaddss %xmm2, %xmm4, %xmm5 | |
vmulss %xmm5, %xmm4, %xmm5 | |
# xmm7 = x * y (shared by c and d); xmm2 = c = x*y + x
vmulss %xmm2, %xmm4, %xmm7 | |
vaddss %xmm7, %xmm4, %xmm2 | |
# xmm4 = d = x*y*a + b
vmulss %xmm5, %xmm7, %xmm4 | |
vaddss %xmm4, %xmm1, %xmm4 | |
# xmm7 = c - d, the final divisor: error exit if it is ordered-equal to 0.0.
vsubss %xmm4, %xmm2, %xmm7 | |
vucomiss %xmm3, %xmm7 | |
jne .LBB0_17 | |
jnp .LBB0_6 | |
.LBB0_17: | |
# xmm3 = a - b
vsubss %xmm1, %xmm5, %xmm3 | |
# xmm1 = e = (a / b) * c - d
vdivss %xmm1, %xmm5, %xmm1 | |
vmulss %xmm1, %xmm2, %xmm1 | |
vsubss %xmm4, %xmm1, %xmm1 | |
# acc += ((a - b) / (c - d)) * e
vdivss %xmm7, %xmm3, %xmm2 | |
vmulss %xmm2, %xmm1, %xmm1 | |
vaddss %xmm1, %xmm6, %xmm6 | |
# Advance both element pointers by one float32 (4 bytes) and loop.
addq $4, %rsi | |
addq $4, %rax | |
addl $-1, %ebx | |
cmpl $1, %ebx | |
jg .LBB0_5 | |
# ---- success exit: undo the two increments made on entry -------------------
# xadd returns the previous counter value; if it was 1 the counter just hit
# zero and NRT_MemInfo_call_dtor is called (out of line, at .LBB0_9/.LBB0_12).
testq %rcx, %rcx | |
je .LBB0_10 | |
movq $-1, %rax | |
lock xaddq %rax, (%rcx) | |
cmpq $1, %rax | |
je .LBB0_9 | |
.LBB0_10: | |
testq %rdi, %rdi | |
je .LBB0_13 | |
movq $-1, %rax | |
lock xaddq %rax, (%rdi) | |
cmpq $1, %rax | |
je .LBB0_12 | |
.LBB0_13: | |
# Store acc through the result pointer and return 0.
vmovss %xmm6, (%r14) | |
xorl %eax, %eax | |
jmp .LBB0_14 | |
# ---- error exit: a zero divisor was found ----------------------------------
# Publishes the address of .const.picklebuf through rdx and returns 1.
# NOTE(review): this path goes straight to the epilogue and does not pass
# through the counter decrements above -- confirm that is intended.
.LBB0_6: | |
movabsq $.const.picklebuf.2671449003720, %rax | |
movq %rax, (%rdx) | |
movl $1, %eax | |
# ---- common epilogue: restore saved registers in reverse order -------------
.LBB0_14: | |
vmovaps 32(%rsp), %xmm6 | |
vmovaps 48(%rsp), %xmm7 | |
addq $72, %rsp | |
popq %rbx | |
popq %rdi | |
popq %rsi | |
popq %r14 | |
retq | |
# Out-of-line destructor call for the second counter (rcx already holds it).
.LBB0_9: | |
movabsq $NRT_MemInfo_call_dtor, %rax | |
callq *%rax | |
jmp .LBB0_10 | |
# Out-of-line destructor call for the first counter (moved from rdi into rcx).
.LBB0_12: | |
movabsq $NRT_MemInfo_call_dtor, %rax | |
movq %rdi, %rcx | |
callq *%rax | |
jmp .LBB0_13 | |
.Lfunc_end0: | |
.size "__main__.f$3.array(float32,_1d,_C).array(float32,_1d,_C)", .Lfunc_end0-"__main__.f$3.array(float32,_1d,_C).array(float32,_1d,_C)" | |
.cfi_endproc | |
# ---------------------------------------------------------------------------
# CPython-facing wrapper for the jitted f(x, y).  GAS / AT&T syntax.
#
# What the code below does, in order:
#   1. PyArg_UnpackTuple(<rdx>, "f", 2, 2, &obj0, &obj1)
#   2. NRT_adapt_ndarray_from_python on each object into a stack struct
#   3. PyEval_SaveThread, run an inlined copy of the 8-iteration kernel loop,
#      PyEval_RestoreThread
#   4. on success: PyFloat_FromDouble(acc); on a zero divisor: unpickle the
#      constant exception payload and raise it, returning 0 (NULL)
#
# Inputs as used here: rcx = object checked for null and for a non-zero field
# at offset 24 (kept in rsi until it is reused); rdx = tuple handed to
# PyArg_UnpackTuple.
# NOTE(review): register usage (rsi/rdi/xmm6/xmm7 preserved, call arguments in
# rcx/rdx/r8/r9 with further arguments at 32(%rsp)/40(%rsp)) looks like the
# Microsoft x64 calling convention -- confirm against the build target.
# ---------------------------------------------------------------------------
.globl "cpython.__main__.f$3.array(float32,_1d,_C).array(float32,_1d,_C)" | |
.align 16, 0x90 | |
.type "cpython.__main__.f$3.array(float32,_1d,_C).array(float32,_1d,_C)",@function | |
"cpython.__main__.f$3.array(float32,_1d,_C).array(float32,_1d,_C)": | |
.cfi_startproc | |
# Prologue: rbp-based frame so rsp can be realigned to 32 bytes below.
pushq %rbp | |
.Ltmp11: | |
.cfi_def_cfa_offset 16 | |
.Ltmp12: | |
.cfi_offset %rbp, -16 | |
movq %rsp, %rbp | |
.Ltmp13: | |
.cfi_def_cfa_register %rbp | |
pushq %r15 | |
pushq %r14 | |
pushq %rsi | |
pushq %rdi | |
pushq %rbx | |
# Align rsp down to 32 bytes (the frame is later written with aligned 32-byte
# ymm stores) and reserve 288 bytes; xmm6/xmm7 are saved relative to rbp.
andq $-32, %rsp | |
subq $288, %rsp | |
vmovaps %xmm7, -64(%rbp) | |
vmovaps %xmm6, -80(%rbp) | |
.Ltmp14: | |
.cfi_offset %rbx, -56 | |
.Ltmp15: | |
.cfi_offset %rdi, -48 | |
.Ltmp16: | |
.cfi_offset %rsi, -40 | |
.Ltmp17: | |
.cfi_offset %r14, -32 | |
.Ltmp18: | |
.cfi_offset %r15, -24 | |
.Ltmp19: | |
.cfi_offset %xmm6, -96 | |
.Ltmp20: | |
.cfi_offset %xmm7, -80 | |
# Keep the incoming rcx in rsi; it is validated after argument unpacking.
movq %rcx, %rsi | |
# Step 1: PyArg_UnpackTuple(rdx, "f", 2, 2, &216(%rsp), &208(%rsp)).
# The two output-slot addresses are passed on the stack at 32/40(%rsp).
leaq 208(%rsp), %rax | |
movq %rax, 40(%rsp) | |
leaq 216(%rsp), %rax | |
movq %rax, 32(%rsp) | |
movabsq $.const.f, %rax | |
movabsq $PyArg_UnpackTuple, %rbx | |
movl $2, %r8d | |
movl $2, %r9d | |
movq %rdx, %rcx | |
movq %rax, %rdx | |
callq *%rbx | |
# Zero the two array structs at 128(%rsp) and 64(%rsp) with overlapping
# 32-byte stores (bytes 128..183 and 64..119).
vxorps %ymm0, %ymm0, %ymm0 | |
vmovups %ymm0, 152(%rsp) | |
vmovaps %ymm0, 128(%rsp) | |
vmovups %ymm0, 88(%rsp) | |
vmovaps %ymm0, 64(%rsp) | |
# Unpack returned 0 -> return 0 (NULL).
testl %eax, %eax | |
je .LBB1_1 | |
# Null first argument -> print a fatal message and trap (.LBB1_38).
testq %rsi, %rsi | |
je .LBB1_38 | |
# Zero field at offset 24 of the first argument -> RuntimeError (.LBB1_5).
cmpq $0, 24(%rsi) | |
je .LBB1_5 | |
# Step 2a: adapt the first object into the struct at 128(%rsp).
# rsi is reused to hold the adapter's address for the second call.
movq 216(%rsp), %rcx | |
movabsq $NRT_adapt_ndarray_from_python, %rsi | |
leaq 128(%rsp), %rdx | |
vzeroupper | |
callq *%rsi | |
testl %eax, %eax | |
jne .LBB1_1 | |
# r15 = struct offset 0 (used below as a counter pointer),
# rbx = struct offset 32 (used below as the x element pointer).
movq 128(%rsp), %r15 | |
movq 160(%rsp), %rbx | |
# Step 2b: adapt the second object into the struct at 64(%rsp);
# on failure release the first array (.LBB1_8).
movq 208(%rsp), %rcx | |
leaq 64(%rsp), %rdx | |
callq *%rsi | |
testl %eax, %eax | |
jne .LBB1_8 | |
# rsi = second counter pointer, rdi = y element pointer.
movq 64(%rsp), %rsi | |
movq 96(%rsp), %rdi | |
# Step 3: PyEval_SaveThread; its return value is kept in r14 for the
# matching PyEval_RestoreThread.
movabsq $PyEval_SaveThread, %rax | |
callq *%rax | |
movq %rax, %r14 | |
# ---- inlined kernel --------------------------------------------------------
# Atomically increment each non-null counter, as the standalone kernel does.
testq %r15, %r15 | |
je .LBB1_13 | |
lock addq $1, (%r15) | |
.LBB1_13: | |
# xmm0 = 0.0; eax = 9 gives 8 iterations (decrement, continue while eax > 1).
vxorps %xmm0, %xmm0, %xmm0 | |
movl $9, %eax | |
testq %rsi, %rsi | |
je .LBB1_15 | |
lock addq $1, (%rsi) | |
.LBB1_15: | |
# xmm6 = acc = 0.0
vxorps %xmm6, %xmm6, %xmm6 | |
.align 16, 0x90 | |
.LBB1_16: | |
# xmm2 = y[i]; ordered-equal to 0.0 -> error path (.LBB1_28).
vmovss (%rdi), %xmm2 | |
vucomiss %xmm0, %xmm2 | |
jne .LBB1_17 | |
jnp .LBB1_28 | |
.LBB1_17: | |
# xmm4 = x[i]; xmm1 = b = (x - y) / y; b ordered-equal to 0.0 -> error path.
vmovss (%rbx), %xmm4 | |
vsubss %xmm2, %xmm4, %xmm1 | |
vdivss %xmm2, %xmm1, %xmm1 | |
vxorps %xmm3, %xmm3, %xmm3 | |
vucomiss %xmm3, %xmm1 | |
jne .LBB1_26 | |
jnp .LBB1_28 | |
jmp .LBB1_26 | |
.LBB1_26: | |
# xmm5 = a = (x + y) * x
vaddss %xmm2, %xmm4, %xmm5 | |
vmulss %xmm5, %xmm4, %xmm5 | |
# xmm7 = x * y; xmm2 = c = x*y + x
vmulss %xmm2, %xmm4, %xmm7 | |
vaddss %xmm7, %xmm4, %xmm2 | |
# xmm4 = d = x*y*a + b
vmulss %xmm5, %xmm7, %xmm4 | |
vaddss %xmm4, %xmm1, %xmm4 | |
# xmm7 = c - d; ordered-equal to 0.0 -> error path.
vsubss %xmm4, %xmm2, %xmm7 | |
vucomiss %xmm3, %xmm7 | |
jne .LBB1_27 | |
jnp .LBB1_28 | |
.LBB1_27: | |
# xmm3 = a - b; xmm1 = e = (a / b) * c - d
vsubss %xmm1, %xmm5, %xmm3 | |
vdivss %xmm1, %xmm5, %xmm1 | |
vmulss %xmm1, %xmm2, %xmm1 | |
vsubss %xmm4, %xmm1, %xmm1 | |
# acc += ((a - b) / (c - d)) * e
vdivss %xmm7, %xmm3, %xmm2 | |
vmulss %xmm2, %xmm1, %xmm1 | |
vaddss %xmm1, %xmm6, %xmm6 | |
# Advance both element pointers by 4 bytes and loop.
addq $4, %rdi | |
addq $4, %rbx | |
addl $-1, %eax | |
cmpl $1, %eax | |
jg .LBB1_16 | |
# Loop finished: undo the two increments made before the loop.  A previous
# counter value of 1 means it reached zero -> destructor call (out of line).
testq %rsi, %rsi | |
je .LBB1_21 | |
movq $-1, %rax | |
lock xaddq %rax, (%rsi) | |
cmpq $1, %rax | |
je .LBB1_20 | |
.LBB1_21: | |
testq %r15, %r15 | |
je .LBB1_22 | |
movq $-1, %rax | |
lock xaddq %rax, (%r15) | |
cmpq $1, %rax | |
je .LBB1_24 | |
.LBB1_25: | |
# Success with a non-null first counter: restore the thread state,
# edi = 0 (no error), then release the wrapper's own references.
movabsq $PyEval_RestoreThread, %rax | |
movq %r14, %rcx | |
callq *%rax | |
xorl %edi, %edi | |
jmp .LBB1_29 | |
# ---- zero-divisor path -----------------------------------------------------
# Restore the thread state, clear acc, and set edi = 1 (error flag).
# NOTE(review): this path skips the in-loop-exit decrements above, so each
# counter is decremented once here versus twice on success -- confirm intended.
.LBB1_28: | |
movabsq $PyEval_RestoreThread, %rax | |
movq %r14, %rcx | |
callq *%rax | |
vxorps %xmm6, %xmm6, %xmm6 | |
movl $1, %edi | |
testq %r15, %r15 | |
je .LBB1_31 | |
# ---- release the references obtained by the adapter calls ------------------
.LBB1_29: | |
movq $-1, %rax | |
lock xaddq %rax, (%r15) | |
cmpq $1, %rax | |
je .LBB1_30 | |
.LBB1_31: | |
testq %rsi, %rsi | |
je .LBB1_34 | |
movq $-1, %rax | |
lock xaddq %rax, (%rsi) | |
cmpq $1, %rax | |
je .LBB1_33 | |
.LBB1_34: | |
# edi == 0 -> box the result (.LBB1_35); otherwise raise the exception.
testl %edi, %edi | |
je .LBB1_35 | |
# Step 4 (error): PyErr_Clear, then numba_unpickle(pickledata, 68).  A null
# result returns 0 directly; otherwise numba_do_raise(result) and return 0.
movabsq $PyErr_Clear, %rax | |
callq *%rax | |
movabsq $.const.pickledata.2671449003720, %rcx | |
movabsq $numba_unpickle, %rax | |
movl $68, %edx | |
callq *%rax | |
testq %rax, %rax | |
je .LBB1_1 | |
movabsq $numba_do_raise, %rdx | |
movq %rax, %rcx | |
callq *%rdx | |
# Failure return value: rax = 0.
.LBB1_1: | |
xorl %eax, %eax | |
jmp .LBB1_2 | |
# Step 4 (success): widen the float32 accumulator to double and box it.
.LBB1_35: | |
vcvtss2sd %xmm6, %xmm6, %xmm0 | |
movabsq $PyFloat_FromDouble, %rax | |
callq *%rax | |
# ---- common epilogue -------------------------------------------------------
# rsp is recomputed from rbp (five 8-byte pushes below it) because the
# prologue realigned rsp.
.LBB1_2: | |
vmovaps -80(%rbp), %xmm6 | |
vmovaps -64(%rbp), %xmm7 | |
leaq -40(%rbp), %rsp | |
popq %rbx | |
popq %rdi | |
popq %rsi | |
popq %r14 | |
popq %r15 | |
popq %rbp | |
vzeroupper | |
retq | |
# PyErr_SetString(PyExc_RuntimeError, "missing Environment"), then fail.
.LBB1_5: | |
movabsq $PyExc_RuntimeError, %rcx | |
movabsq $".const.missing Environment", %rdx | |
movabsq $PyErr_SetString, %rax | |
vzeroupper | |
callq *%rax | |
jmp .LBB1_1 | |
# Second adapter call failed: drop the first array's reference, then fail.
.LBB1_8: | |
testq %r15, %r15 | |
je .LBB1_1 | |
movq $-1, %rax | |
lock xaddq %rax, (%r15) | |
cmpq $1, %rax | |
jne .LBB1_1 | |
movabsq $NRT_MemInfo_call_dtor, %rax | |
movq %r15, %rcx | |
callq *%rax | |
jmp .LBB1_1 | |
# Out-of-line destructor calls; each jumps back to the point after its decrement.
.LBB1_30: | |
movabsq $NRT_MemInfo_call_dtor, %rax | |
movq %r15, %rcx | |
callq *%rax | |
jmp .LBB1_31 | |
.LBB1_33: | |
movabsq $NRT_MemInfo_call_dtor, %rax | |
movq %rsi, %rcx | |
callq *%rax | |
jmp .LBB1_34 | |
# Success with a null first counter: restore the thread state, edi = 0, and
# skip straight to the second counter's release.
.LBB1_22: | |
movabsq $PyEval_RestoreThread, %rax | |
movq %r14, %rcx | |
callq *%rax | |
xorl %edi, %edi | |
jmp .LBB1_31 | |
.LBB1_20: | |
movabsq $NRT_MemInfo_call_dtor, %rax | |
movq %rsi, %rcx | |
callq *%rax | |
jmp .LBB1_21 | |
.LBB1_24: | |
movabsq $NRT_MemInfo_call_dtor, %rax | |
movq %r15, %rcx | |
callq *%rax | |
jmp .LBB1_25 | |
# Null first argument: puts() the fatal message, then ud2 (no return).
.LBB1_38: | |
movabsq $".const.Fatal error: missing _dynfunc.Closure", %rcx | |
movabsq $puts, %rax | |
vzeroupper | |
callq *%rax | |
ud2 | |
.Lfunc_end1: | |
.size "cpython.__main__.f$3.array(float32,_1d,_C).array(float32,_1d,_C)", .Lfunc_end1-"cpython.__main__.f$3.array(float32,_1d,_C).array(float32,_1d,_C)" | |
.cfi_endproc | |
# ---------------------------------------------------------------------------
# Constant data referenced by the two functions above.
# ---------------------------------------------------------------------------
# 16-byte descriptor for the exception payload: an 8-byte pointer to the
# pickled bytes, a 4-byte length (68) and 4 bytes of zero padding.  Its
# address is what the native kernel stores through rdx on its error exit.
.type .const.picklebuf.2671449003720,@object | |
.section .data.rel.ro,"aw",@progbits | |
.align 8 | |
.const.picklebuf.2671449003720: | |
.quad .const.pickledata.2671449003720 | |
.long 68 | |
.zero 4 | |
.size .const.picklebuf.2671449003720, 16 | |
# The 68 pickled bytes themselves; the readable parts name builtins,
# ZeroDivisionError and the message "division by zero".  The wrapper passes
# this buffer and the length 68 to numba_unpickle.
.type .const.pickledata.2671449003720,@object | |
.section .rodata,"a",@progbits | |
.align 16 | |
.const.pickledata.2671449003720: | |
.ascii "\200\004\2259\000\000\000\000\000\000\000\214\bbuiltins\224\214\021ZeroDivisionError\224\223\224\214\020division by zero\224\205\224\206\224." | |
.size .const.pickledata.2671449003720, 68 | |
# Function name string passed to PyArg_UnpackTuple (1 character + NUL).
.type .const.f,@object | |
.const.f: | |
.asciz "f" | |
.size .const.f, 2 | |
# Message printed with puts() before the ud2 trap in the wrapper.
.type ".const.Fatal error: missing _dynfunc.Closure",@object | |
.align 16 | |
".const.Fatal error: missing _dynfunc.Closure": | |
.asciz "Fatal error: missing _dynfunc.Closure" | |
.size ".const.Fatal error: missing _dynfunc.Closure", 38 | |
# Message passed to PyErr_SetString together with PyExc_RuntimeError.
.type ".const.missing Environment",@object | |
.align 16 | |
".const.missing Environment": | |
.asciz "missing Environment" | |
.size ".const.missing Environment", 20 | |
# Empty .note.GNU-stack section.
.section ".note.GNU-stack","",@progbits |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Compile a small float32 kernel with Numba and dump its generated assembly.

The kernel is a fixed-trip-count loop of element-wise arithmetic; the dumped
assembly can then be inspected to see whether the loop was vectorised.
"""
import numpy as np
from numba import float32, int32, jit

# Element type used for the accumulator and for the test arrays.
dtype = float32
# Trip count of the kernel loop and row length of the test arrays.
width = int32(8)


@jit(nopython=True, nogil=True)
def f(x, y):
    """Accumulate ``(a - b) / (c - d) * e`` over the first ``width`` elements.

    Parameters
    ----------
    x, y : 1-d arrays
        Indexed at positions ``0 .. width - 1``; no length check is made in
        this function, so both must hold at least ``width`` elements.

    Returns
    -------
    The accumulated sum, starting from ``dtype(0.0)``.

    Raises
    ------
    ZeroDivisionError
        If any of the divisors ``y[i]``, ``b`` or ``c - d`` is zero.
    """
    acc = dtype(0.0)
    for i in range(width):
        a = (x[i] + y[i]) * x[i]
        b = (x[i] - y[i]) / y[i]
        c = x[i] * y[i] + x[i]
        d = x[i] * y[i] * a + b
        e = a / b * c - d
        acc += (a - b) / (c - d) * e
    return acc


# Two rows of ``width`` consecutive values: a = 0..width-1, b = width..2*width-1.
# The row length follows ``width`` so the data stays in step with the kernel's
# loop bound instead of relying on a separately hard-coded 8.
a, b = np.r_[:2 * width].astype(
    getattr(np, dtype.name)).reshape((2, width))

# Calling f once triggers compilation for this argument signature.
c = f(a, b)

# inspect_asm() maps each compiled signature to its assembly text; exactly one
# signature is expected here, which the single-element unpack enforces.
(asm,) = f.inspect_asm().values()
with open('test-numba-autovec.asm', 'w') as fd:
    fd.write(asm)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment