Skip to content

Instantly share code, notes, and snippets.

@maedoc
Created December 20, 2016 21:29
Show Gist options
  • Save maedoc/f31994da1fcc862d42d3a5649a040965 to your computer and use it in GitHub Desktop.
Save maedoc/f31994da1fcc862d42d3a5649a040965 to your computer and use it in GitHub Desktop.
testing autovec in numba
# Compiler-generated x86-64 listing in AT&T syntax (operands read source first,
# destination last), as written out by f.inspect_asm() in the Python script
# that accompanies this listing.
#
# Native entry point for the jitted kernel f(x, y).
# The prologue preserves rsi, rdi, xmm6 and xmm7 and the arguments arrive in
# rcx / rdx / r9 plus stack slots, which is consistent with the Microsoft x64
# calling convention (presumably the listing was produced on Windows).
#
# Register roles as used by the code below:
#   rcx (copied to r14)        address that receives the float32 sum on success
#   rdx                        address that receives the exception record on failure
#   r9  (copied to rdi)        pointer to an atomically counted quadword, may be null
#   192(%rsp) (loaded to rcx)  second pointer to a counted quadword, may be null
#   168(%rsp) (loaded to rsi)  float32 stream read 4 bytes per iteration
#   224(%rsp) (loaded to rax)  float32 stream read 4 bytes per iteration, used as divisor
#   xmm6                       running sum
# Stack offsets above are relative to rsp after the prologue.
# In the comments, x is the value read through rsi and y the value read through
# rax; the names are inferred from the arithmetic of the Python kernel
# (presumably x[i] and y[i], to be confirmed against the Numba argument layout).
#
# Returns eax = 0 on success, eax = 1 when one of the three divisors is zero.
.text
.file "f"
.globl "__main__.f$3.array(float32,_1d,_C).array(float32,_1d,_C)"
.align 16, 0x90
.type "__main__.f$3.array(float32,_1d,_C).array(float32,_1d,_C)",@function
"__main__.f$3.array(float32,_1d,_C).array(float32,_1d,_C)":
.cfi_startproc
# Prologue: save the integer registers this function overwrites.
pushq %r14
.Ltmp0:
.cfi_def_cfa_offset 16
pushq %rsi
.Ltmp1:
.cfi_def_cfa_offset 24
pushq %rdi
.Ltmp2:
.cfi_def_cfa_offset 32
pushq %rbx
.Ltmp3:
.cfi_def_cfa_offset 40
# Reserve 72 bytes; xmm6/xmm7 are spilled at offsets 32 and 48, leaving the
# lowest 32 bytes free for the callees invoked further down.
subq $72, %rsp
vmovaps %xmm7, 48(%rsp)
vmovaps %xmm6, 32(%rsp)
.Ltmp4:
.cfi_def_cfa_offset 112
.Ltmp5:
.cfi_offset %rbx, -40
.Ltmp6:
.cfi_offset %rdi, -32
.Ltmp7:
.cfi_offset %rsi, -24
.Ltmp8:
.cfi_offset %r14, -16
.Ltmp9:
.cfi_offset %xmm6, -80
.Ltmp10:
.cfi_offset %xmm7, -64
# Move the incoming arguments out of the registers that are reused below.
movq %r9, %rdi
movq %rcx, %r14
movq 192(%rsp), %rcx
# Atomically add one to the counter behind rdi unless the pointer is null.
testq %rdi, %rdi
je .LBB0_2
lock addq $1, (%rdi)
.LBB0_2:
movq 224(%rsp), %rax
movq 168(%rsp), %rsi
# xmm0 stays 0.0 for the whole loop and is the reference for the first zero test.
vxorps %xmm0, %xmm0, %xmm0
# Loop counter starts at 9 and the loop repeats while the decremented value is
# greater than 1, which gives 8 iterations.
movl $9, %ebx
# Same conditional atomic increment for the counter behind rcx.
testq %rcx, %rcx
je .LBB0_4
lock addq $1, (%rcx)
.LBB0_4:
# Running sum starts at 0.0.
vxorps %xmm6, %xmm6, %xmm6
# Align the loop head to 16 bytes, padding with 0x90 bytes.
.align 16, 0x90
.LBB0_5:
# Loop body, one scalar float32 element per iteration (no packed arithmetic).
# Load y and leave through the failure path when it compares equal to 0.0.
# An unordered compare (NaN) sets the parity flag and continues with the math.
vmovss (%rax), %xmm2
vucomiss %xmm0, %xmm2
jne .LBB0_15
jp .LBB0_15
jmp .LBB0_6
.LBB0_15:
# xmm4 = x, xmm1 = b = (x - y) / y
vmovss (%rsi), %xmm4
vsubss %xmm2, %xmm4, %xmm1
vdivss %xmm2, %xmm1, %xmm1
# b is used as a divisor later, so fail when b equals 0.0 (ordered compare).
vxorps %xmm3, %xmm3, %xmm3
vucomiss %xmm3, %xmm1
jne .LBB0_16
jnp .LBB0_6
.LBB0_16:
# xmm5 = a = (x + y) * x
vaddss %xmm2, %xmm4, %xmm5
vmulss %xmm5, %xmm4, %xmm5
# xmm7 = x * y, xmm2 = c = x * y + x
vmulss %xmm2, %xmm4, %xmm7
vaddss %xmm7, %xmm4, %xmm2
# xmm4 = d = x * y * a + b
vmulss %xmm5, %xmm7, %xmm4
vaddss %xmm4, %xmm1, %xmm4
# xmm7 = c - d, the third divisor; fail when it equals 0.0.
vsubss %xmm4, %xmm2, %xmm7
vucomiss %xmm3, %xmm7
jne .LBB0_17
jnp .LBB0_6
.LBB0_17:
# xmm3 = a - b
vsubss %xmm1, %xmm5, %xmm3
# xmm1 = e = a / b * c - d
vdivss %xmm1, %xmm5, %xmm1
vmulss %xmm1, %xmm2, %xmm1
vsubss %xmm4, %xmm1, %xmm1
# xmm2 = (a - b) / (c - d); sum += xmm2 * e
vdivss %xmm7, %xmm3, %xmm2
vmulss %xmm2, %xmm1, %xmm1
vaddss %xmm1, %xmm6, %xmm6
# Advance both streams by one float32 and count the iteration.
addq $4, %rsi
addq $4, %rax
addl $-1, %ebx
cmpl $1, %ebx
jg .LBB0_5
# Loop finished: undo the increment on the counter behind rcx. xadd leaves the
# previous counter value in rax; a previous value of 1 means the counter just
# reached zero, so the destructor helper is called at .LBB0_9.
testq %rcx, %rcx
je .LBB0_10
movq $-1, %rax
lock xaddq %rax, (%rcx)
cmpq $1, %rax
je .LBB0_9
.LBB0_10:
# Same decrement-and-maybe-destroy sequence for the counter behind rdi.
testq %rdi, %rdi
je .LBB0_13
movq $-1, %rax
lock xaddq %rax, (%rdi)
cmpq $1, %rax
je .LBB0_12
.LBB0_13:
# Success: store the sum through the pointer received in rcx and return 0.
vmovss %xmm6, (%r14)
xorl %eax, %eax
jmp .LBB0_14
.LBB0_6:
# Failure: store the address of the exception record through rdx and return 1.
# NOTE(review): this path goes straight to the epilogue without the two
# decrements above, so the increments made before the loop are not undone
# here; presumably intentional in the code generator, confirm.
movabsq $.const.picklebuf.2671449003720, %rax
movq %rax, (%rdx)
movl $1, %eax
.LBB0_14:
# Epilogue: restore the saved registers in reverse order.
vmovaps 32(%rsp), %xmm6
vmovaps 48(%rsp), %xmm7
addq $72, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
retq
.LBB0_9:
# Out-of-line destructor call; rcx already holds the pointer as first argument.
movabsq $NRT_MemInfo_call_dtor, %rax
callq *%rax
jmp .LBB0_10
.LBB0_12:
# Out-of-line destructor call for the pointer kept in rdi.
movabsq $NRT_MemInfo_call_dtor, %rax
movq %rdi, %rcx
callq *%rax
jmp .LBB0_13
.Lfunc_end0:
.size "__main__.f$3.array(float32,_1d,_C).array(float32,_1d,_C)", .Lfunc_end0-"__main__.f$3.array(float32,_1d,_C).array(float32,_1d,_C)"
.cfi_endproc
# Python-callable wrapper around the kernel (AT&T syntax, compiler generated).
# It unpacks two arguments from the tuple received in rdx, converts each with
# NRT_adapt_ndarray_from_python, runs an inlined copy of the kernel loop between
# PyEval_SaveThread and PyEval_RestoreThread, and returns the sum through
# PyFloat_FromDouble.
#
# Inputs:  rcx = object checked for null and for a non-zero quadword at offset 24
#                (the error strings call these the closure and its environment)
#          rdx = argument tuple handed to PyArg_UnpackTuple
# Output:  rax = result of PyFloat_FromDouble on success, 0 on any failure.
# Long-lived registers after unpacking:
#   r15 / rsi   first quadword of the first / second converted record; each is
#               used as a pointer to an atomically counted quadword, may be null
#   rbx / rdi   quadword at offset 32 of the first / second record; used as
#               float32 streams (rdi supplies the divisor)
#   r14         value returned by PyEval_SaveThread
#   xmm6        running sum
.globl "cpython.__main__.f$3.array(float32,_1d,_C).array(float32,_1d,_C)"
.align 16, 0x90
.type "cpython.__main__.f$3.array(float32,_1d,_C).array(float32,_1d,_C)",@function
"cpython.__main__.f$3.array(float32,_1d,_C).array(float32,_1d,_C)":
.cfi_startproc
# Prologue with a frame pointer, needed because rsp is realigned below.
pushq %rbp
.Ltmp11:
.cfi_def_cfa_offset 16
.Ltmp12:
.cfi_offset %rbp, -16
movq %rsp, %rbp
.Ltmp13:
.cfi_def_cfa_register %rbp
pushq %r15
pushq %r14
pushq %rsi
pushq %rdi
pushq %rbx
# Round rsp down to a 32-byte boundary so the aligned 32-byte stores below are
# valid, then reserve 288 bytes of locals.
andq $-32, %rsp
subq $288, %rsp
vmovaps %xmm7, -64(%rbp)
vmovaps %xmm6, -80(%rbp)
.Ltmp14:
.cfi_offset %rbx, -56
.Ltmp15:
.cfi_offset %rdi, -48
.Ltmp16:
.cfi_offset %rsi, -40
.Ltmp17:
.cfi_offset %r14, -32
.Ltmp18:
.cfi_offset %r15, -24
.Ltmp19:
.cfi_offset %xmm6, -96
.Ltmp20:
.cfi_offset %xmm7, -80
# Keep the first argument in rsi; it is validated after the unpack call.
movq %rcx, %rsi
# Call PyArg_UnpackTuple(tuple, "f", 2, 2, &slot216, &slot208). The two output
# addresses are the fifth and sixth arguments and therefore go on the stack.
leaq 208(%rsp), %rax
movq %rax, 40(%rsp)
leaq 216(%rsp), %rax
movq %rax, 32(%rsp)
movabsq $.const.f, %rax
movabsq $PyArg_UnpackTuple, %rbx
movl $2, %r8d
movl $2, %r9d
movq %rdx, %rcx
movq %rax, %rdx
callq *%rbx
# Zero the two 56-byte conversion records at 128(%rsp) and 64(%rsp); each is
# cleared with two overlapping 32-byte stores.
vxorps %ymm0, %ymm0, %ymm0
vmovups %ymm0, 152(%rsp)
vmovaps %ymm0, 128(%rsp)
vmovups %ymm0, 88(%rsp)
vmovaps %ymm0, 64(%rsp)
# A zero result from the unpack call means failure: return 0.
testl %eax, %eax
je .LBB1_1
# Validate the first argument: null is fatal, a zero quadword at offset 24
# raises RuntimeError.
testq %rsi, %rsi
je .LBB1_38
cmpq $0, 24(%rsi)
je .LBB1_5
# Convert the first unpacked object into the record at 128(%rsp).
# rsi is reused from here on to hold the address of the conversion helper.
movq 216(%rsp), %rcx
movabsq $NRT_adapt_ndarray_from_python, %rsi
leaq 128(%rsp), %rdx
vzeroupper
callq *%rsi
testl %eax, %eax
jne .LBB1_1
movq 128(%rsp), %r15
movq 160(%rsp), %rbx
# Convert the second unpacked object into the record at 64(%rsp); on failure
# the first record has to be released (.LBB1_8).
movq 208(%rsp), %rcx
leaq 64(%rsp), %rdx
callq *%rsi
testl %eax, %eax
jne .LBB1_8
movq 64(%rsp), %rsi
movq 96(%rsp), %rdi
# PyEval_SaveThread is called before the numeric loop and its result kept in
# r14 for the matching PyEval_RestoreThread (the Python kernel is declared
# with nogil=True).
movabsq $PyEval_SaveThread, %rax
callq *%rax
movq %rax, %r14
# Atomically add one to the counter behind r15 unless the pointer is null.
testq %r15, %r15
je .LBB1_13
lock addq $1, (%r15)
.LBB1_13:
# xmm0 = 0.0 reference for the first zero test; eax = loop counter starting
# at 9 and looping while the decremented value is greater than 1 (8 iterations).
vxorps %xmm0, %xmm0, %xmm0
movl $9, %eax
# Same conditional atomic increment for the counter behind rsi.
testq %rsi, %rsi
je .LBB1_15
lock addq $1, (%rsi)
.LBB1_15:
# Running sum starts at 0.0.
vxorps %xmm6, %xmm6, %xmm6
.align 16, 0x90
.LBB1_16:
# Inlined kernel loop, one scalar float32 element per iteration.
# In the comments x is the value read through rbx and y the value read through
# rdi (names inferred from the arithmetic of the Python kernel).
# Load y; equal to 0.0 (ordered) takes the failure path at .LBB1_28.
vmovss (%rdi), %xmm2
vucomiss %xmm0, %xmm2
jne .LBB1_17
jnp .LBB1_28
.LBB1_17:
# xmm4 = x, xmm1 = b = (x - y) / y
vmovss (%rbx), %xmm4
vsubss %xmm2, %xmm4, %xmm1
vdivss %xmm2, %xmm1, %xmm1
# Fail when b equals 0.0 because b is a divisor further down.
vxorps %xmm3, %xmm3, %xmm3
vucomiss %xmm3, %xmm1
jne .LBB1_26
jnp .LBB1_28
# This jump targets the very next label and only makes the fall-through explicit.
jmp .LBB1_26
.LBB1_26:
# xmm5 = a = (x + y) * x
vaddss %xmm2, %xmm4, %xmm5
vmulss %xmm5, %xmm4, %xmm5
# xmm7 = x * y, xmm2 = c = x * y + x
vmulss %xmm2, %xmm4, %xmm7
vaddss %xmm7, %xmm4, %xmm2
# xmm4 = d = x * y * a + b
vmulss %xmm5, %xmm7, %xmm4
vaddss %xmm4, %xmm1, %xmm4
# xmm7 = c - d, the third divisor; fail when it equals 0.0.
vsubss %xmm4, %xmm2, %xmm7
vucomiss %xmm3, %xmm7
jne .LBB1_27
jnp .LBB1_28
.LBB1_27:
# xmm3 = a - b
vsubss %xmm1, %xmm5, %xmm3
# xmm1 = e = a / b * c - d
vdivss %xmm1, %xmm5, %xmm1
vmulss %xmm1, %xmm2, %xmm1
vsubss %xmm4, %xmm1, %xmm1
# xmm2 = (a - b) / (c - d); sum += xmm2 * e
vdivss %xmm7, %xmm3, %xmm2
vmulss %xmm2, %xmm1, %xmm1
vaddss %xmm1, %xmm6, %xmm6
# Advance both streams by one float32 and count the iteration.
addq $4, %rdi
addq $4, %rbx
addl $-1, %eax
cmpl $1, %eax
jg .LBB1_16
# Loop finished: undo the increment made before the loop on the counter behind
# rsi. xadd leaves the previous value in rax; a previous value of 1 means the
# counter reached zero and the destructor helper runs (.LBB1_20).
testq %rsi, %rsi
je .LBB1_21
movq $-1, %rax
lock xaddq %rax, (%rsi)
cmpq $1, %rax
je .LBB1_20
.LBB1_21:
# Same for the counter behind r15; a null r15 takes the short path .LBB1_22.
testq %r15, %r15
je .LBB1_22
movq $-1, %rax
lock xaddq %rax, (%r15)
cmpq $1, %rax
je .LBB1_24
.LBB1_25:
# Success path: hand the saved value back to PyEval_RestoreThread and clear
# the failure flag kept in edi.
movabsq $PyEval_RestoreThread, %rax
movq %r14, %rcx
callq *%rax
xorl %edi, %edi
jmp .LBB1_29
.LBB1_28:
# Failure path (a divisor was zero): call PyEval_RestoreThread, reset the
# sum and set the failure flag edi = 1.
movabsq $PyEval_RestoreThread, %rax
movq %r14, %rcx
callq *%rax
vxorps %xmm6, %xmm6, %xmm6
movl $1, %edi
testq %r15, %r15
je .LBB1_31
.LBB1_29:
# Shared cleanup: one more decrement of the counter behind r15 (presumably
# releasing the reference obtained by the conversion helper, confirm).
movq $-1, %rax
lock xaddq %rax, (%r15)
cmpq $1, %rax
je .LBB1_30
.LBB1_31:
# Shared cleanup: one more decrement of the counter behind rsi.
testq %rsi, %rsi
je .LBB1_34
movq $-1, %rax
lock xaddq %rax, (%rsi)
cmpq $1, %rax
je .LBB1_33
.LBB1_34:
# Branch on the failure flag: zero means build the return value.
testl %edi, %edi
je .LBB1_35
# Failure: clear any pending error, rebuild the exception from the 68-byte
# pickled payload and pass it to numba_do_raise. A null result from
# numba_unpickle skips the raise. Either way the wrapper returns 0.
movabsq $PyErr_Clear, %rax
callq *%rax
movabsq $.const.pickledata.2671449003720, %rcx
movabsq $numba_unpickle, %rax
movl $68, %edx
callq *%rax
testq %rax, %rax
je .LBB1_1
movabsq $numba_do_raise, %rdx
movq %rax, %rcx
callq *%rdx
.LBB1_1:
# Common failure return: rax = 0.
xorl %eax, %eax
jmp .LBB1_2
.LBB1_35:
# Success: widen the float32 sum to double and call PyFloat_FromDouble; its
# result is left in rax as the return value.
vcvtss2sd %xmm6, %xmm6, %xmm0
movabsq $PyFloat_FromDouble, %rax
callq *%rax
.LBB1_2:
# Epilogue: restore xmm6/xmm7, point rsp at the saved integer registers
# (5 pushes of 8 bytes below rbp) and pop them in reverse order.
vmovaps -80(%rbp), %xmm6
vmovaps -64(%rbp), %xmm7
leaq -40(%rbp), %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %r15
popq %rbp
vzeroupper
retq
.LBB1_5:
# The quadword at offset 24 of the first argument was zero:
# PyErr_SetString(PyExc_RuntimeError, "missing Environment"), then return 0.
movabsq $PyExc_RuntimeError, %rcx
movabsq $".const.missing Environment", %rdx
movabsq $PyErr_SetString, %rax
vzeroupper
callq *%rax
jmp .LBB1_1
.LBB1_8:
# The second conversion failed: decrement the counter behind r15, call the
# destructor helper when the previous value was 1, then return 0.
testq %r15, %r15
je .LBB1_1
movq $-1, %rax
lock xaddq %rax, (%r15)
cmpq $1, %rax
jne .LBB1_1
movabsq $NRT_MemInfo_call_dtor, %rax
movq %r15, %rcx
callq *%rax
jmp .LBB1_1
.LBB1_30:
# Out-of-line destructor call for r15 (shared cleanup).
movabsq $NRT_MemInfo_call_dtor, %rax
movq %r15, %rcx
callq *%rax
jmp .LBB1_31
.LBB1_33:
# Out-of-line destructor call for rsi (shared cleanup).
movabsq $NRT_MemInfo_call_dtor, %rax
movq %rsi, %rcx
callq *%rax
jmp .LBB1_34
.LBB1_22:
# Success path when r15 is null: call PyEval_RestoreThread, clear the
# failure flag and skip the r15 decrement in the shared cleanup.
movabsq $PyEval_RestoreThread, %rax
movq %r14, %rcx
callq *%rax
xorl %edi, %edi
jmp .LBB1_31
.LBB1_20:
# Out-of-line destructor call for rsi (post-loop decrement).
movabsq $NRT_MemInfo_call_dtor, %rax
movq %rsi, %rcx
callq *%rax
jmp .LBB1_21
.LBB1_24:
# Out-of-line destructor call for r15 (post-loop decrement).
movabsq $NRT_MemInfo_call_dtor, %rax
movq %r15, %rcx
callq *%rax
jmp .LBB1_25
.LBB1_38:
# The first argument was null: print the fatal message with puts and stop on
# an undefined-instruction trap. Control never returns from here.
movabsq $".const.Fatal error: missing _dynfunc.Closure", %rcx
movabsq $puts, %rax
vzeroupper
callq *%rax
ud2
.Lfunc_end1:
.size "cpython.__main__.f$3.array(float32,_1d,_C).array(float32,_1d,_C)", .Lfunc_end1-"cpython.__main__.f$3.array(float32,_1d,_C).array(float32,_1d,_C)"
.cfi_endproc
# Read-only data referenced by the two functions in this listing.
#
# Exception record whose address the native kernel stores through rdx on
# failure: a pointer to the pickled payload, its 32-bit length (68) and four
# bytes of zero padding, 16 bytes in total.
.type .const.picklebuf.2671449003720,@object
.section .data.rel.ro,"aw",@progbits
.align 8
.const.picklebuf.2671449003720:
.quad .const.pickledata.2671449003720
.long 68
.zero 4
.size .const.picklebuf.2671449003720, 16
# Payload passed to numba_unpickle by the CPython wrapper. The readable parts
# name builtins, ZeroDivisionError and the message "division by zero"; the
# leading bytes look like a pickle protocol 4 header (not verified here).
.type .const.pickledata.2671449003720,@object
.section .rodata,"a",@progbits
.align 16
.const.pickledata.2671449003720:
.ascii "\200\004\2259\000\000\000\000\000\000\000\214\bbuiltins\224\214\021ZeroDivisionError\224\223\224\214\020division by zero\224\205\224\206\224."
.size .const.pickledata.2671449003720, 68
# NUL-terminated name string passed to PyArg_UnpackTuple as its second argument.
.type .const.f,@object
.const.f:
.asciz "f"
.size .const.f, 2
# Message printed with puts when the wrapper receives a null first argument.
.type ".const.Fatal error: missing _dynfunc.Closure",@object
.align 16
".const.Fatal error: missing _dynfunc.Closure":
.asciz "Fatal error: missing _dynfunc.Closure"
.size ".const.Fatal error: missing _dynfunc.Closure", 38
# Message used for the RuntimeError set when offset 24 of the first wrapper
# argument holds zero.
.type ".const.missing Environment",@object
.align 16
".const.missing Environment":
.asciz "missing Environment"
.size ".const.missing Environment", 20
# Empty .note.GNU-stack section; by convention this tells the linker that the
# object does not need an executable stack.
.section ".note.GNU-stack","",@progbits
from numba import *
# Element type and trip count; the jitted kernel reads both as module globals.
dtype = float32
width = int32(8)


@jit(nopython=True, nogil=True)
def f(x, y):
    """Accumulate a chain of float arithmetic over the first `width` elements.

    For each index the kernel derives the intermediate values a..e from
    x[k] and y[k] and adds (a - b) / (c - d) * e to a running total that
    starts at dtype(0.0).

    Parameters: x, y -- indexable sequences with at least `width` elements
    (the driver script passes two 1-d float32 arrays).
    Returns: the accumulated total.
    The three divisions (by y[k], by b and by c - d) are the places where a
    zero divisor can occur.
    """
    total = dtype(0.0)
    for k in range(width):
        xk = x[k]
        yk = y[k]
        xy = xk * yk  # shared by c and d
        a = (xk + yk) * xk
        b = (xk - yk) / yk
        c = xy + xk
        d = xy * a + b
        e = a / b * c - d
        total += (a - b) / (c - d) * e
    return total
import numpy as np
# Build the two input rows from 0 .. 2*width-1 in the kernel's element type.
# The row length comes from `width` rather than a literal 8, so the data always
# matches the kernel's trip count if `width` is changed.
a, b = np.r_[:2 * width].astype(getattr(np, dtype.name)).reshape((2, width))

# Calling the kernel once compiles it for this argument signature.
c = f(a, b)

# inspect_asm() maps each compiled signature to its assembly text; exactly one
# signature exists at this point, so the one-element unpacking also acts as a check.
(asm_listing,) = f.inspect_asm().values()
with open('test-numba-autovec.asm', 'w') as fd:
    fd.write(asm_listing)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment