Created
January 27, 2015 18:44
-
-
Save ArchRobison/cb1aab380905fdc1255f to your computer and use it in GitHub Desktop.
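Attempts to vectorize two of the variants (numbers 3 and 4) from simonbyrne's gist (https://gist.github.com/simonbyrne/62f25608aaf831de5996). In the output below, loop3 (integer AND) is compiled to a 256-bit vandps loop, while loop4 (and_float via llvmcall) remains a scalar vandpd loop.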
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| $ cat truncloop.jl | |
| # use floating point AND instructions (andps/andpd) | |
| # https://github.com/JuliaLang/julia/issues/9868 | |
| # | |
| # and_float(x, y): bitwise AND of the 64-bit patterns of two Float64 values, | |
| # returned as a Float64. Implemented with inline LLVM IR via Base.llvmcall: | |
| # each scalar is inserted into lane 0 of a <2 x double> (lane 1 is left undef), | |
| # both vectors are bitcast to <2 x i64> and ANDed, and the result is bitcast | |
| # back to <2 x double> so that lane 0 can be extracted and returned. | |
| function and_float(x::Float64,y::Float64) | |
| # In the IR below, %0 and %1 refer to the arguments x and y respectively. | |
| Base.llvmcall("""%av = insertelement<2 x double> undef, double %0, i32 0 | |
| %bv = insertelement<2 x double> undef, double %1, i32 0 | |
| %ai = bitcast <2 x double> %av to <2 x i64> | |
| %bi = bitcast <2 x double> %bv to <2 x i64> | |
| %and.i = and <2 x i64> %ai, %bi | |
| %cf = bitcast <2 x i64> %and.i to <2 x double> | |
| %cfe = extractelement<2 x double> %cf, i32 0 | |
| ret double %cfe""",Float64,(Float64,Float64),x,y) | |
| end | |
| # 3) AND out bits | |
| # Reinterprets x as a UInt64, ANDs it with a mask that keeps the upper 32 bits | |
| # and clears the lower 32 bits, then reinterprets the result as a Float64. | |
| t3(x::Float64) = reinterpret(Float64,reinterpret(UInt64,x) & 0xffff_ffff_0000_0000) | |
| # 4) AND out bits using and_float | |
| # Applies the same mask as t3 (upper 32 bits kept, lower 32 bits cleared), but | |
| # passes the mask to and_float as a Float64 bit pattern, so x itself is never | |
| # reinterpreted as an integer in Julia code. | |
| t4(x::Float64) = and_float(x,reinterpret(Float64, 0xffff_ffff_0000_0000)) | |
| # loop3(a, b): stores t3(b[i]) into a[i] for every i in 1:length(a). | |
| # NOTE: @inbounds removes bounds checks, so b must have at least length(a) | |
| # elements; nothing here verifies that. | |
| # In the code_native output captured further down, this loop has a vectorized | |
| # body (vandps on ymm registers) followed by a scalar remainder loop. | |
| function loop3(a,b) | |
| @inbounds @simd for i = 1:length(a) | |
| a[i] = t3(b[i]) | |
| end | |
| end | |
| # loop4(a, b): stores t4(b[i]) into a[i] for every i in 1:length(a). | |
| # NOTE: @inbounds removes bounds checks, so b must have at least length(a) | |
| # elements; nothing here verifies that. | |
| # In the code_native output captured further down, this loop contains only a | |
| # scalar body (vmovsd / vandpd on xmm registers), i.e. it was not vectorized. | |
| function loop4(a,b) | |
| @inbounds @simd for i = 1:length(a) | |
| a[i] = t4(b[i]) | |
| end | |
| end | |
| # Print the native code generated for each loop when both arguments are | |
| # Matrix{Float64}, with a banner line before each listing. | |
| println("loop3 ----------------------------------------") | |
| code_native(loop3,(Matrix{Float64},Matrix{Float64})) | |
| println() | |
| println("loop4 ----------------------------------------") | |
| code_native(loop4,(Matrix{Float64},Matrix{Float64})) | |
| println() | |
| $ julia truncloop.jl | |
| loop3 ---------------------------------------- | |
| .text | |
| Filename: truncloop.jl | |
| Source line: 0 | |
| push rbp | |
| mov rbp, rsp | |
| Source line: 21 | |
| mov rax, qword ptr [rdi + 16] | |
| xor edx, edx | |
| Source line: 49 | |
| test rax, rax | |
| cmovns rdx, rax | |
| dec rdx | |
| jo L240 | |
| inc rdx | |
| jo L240 | |
| Source line: 50 | |
| test rdx, rdx | |
| jle L230 | |
| Source line: 21 | |
| mov r8, qword ptr [rdi + 8] | |
| mov r9, qword ptr [rsi + 8] | |
| xor edi, edi | |
| test rdx, rdx | |
| je L172 | |
| xor edi, edi | |
| Source line: 56 | |
| mov r10, rdx | |
| and r10, -16 | |
| je L167 | |
| lea rdi, qword ptr [r8 + 96] | |
| lea rcx, qword ptr [r9 + 96] | |
| mov rax, rdx | |
| and rax, -16 | |
| movabs rsi, 140719585243136 | |
| Source line: 22 | |
| vbroadcastsd ymm0, qword ptr [rsi] | |
| nop word ptr cs:[rax + rax] | |
| L112: vandps ymm1, ymm0, ymmword ptr [rcx - 96] | |
| vandps ymm2, ymm0, ymmword ptr [rcx - 64] | |
| vandps ymm3, ymm0, ymmword ptr [rcx - 32] | |
| vandps ymm4, ymm0, ymmword ptr [rcx] | |
| vmovups ymmword ptr [rdi - 96], ymm1 | |
| vmovups ymmword ptr [rdi - 64], ymm2 | |
| vmovups ymmword ptr [rdi - 32], ymm3 | |
| vmovups ymmword ptr [rdi], ymm4 | |
| Source line: 56 | |
| sub rdi, -128 | |
| sub rcx, -128 | |
| add rax, -16 | |
| jne L112 | |
| mov rdi, r10 | |
| L167: cmp rdx, rdi | |
| je L230 | |
| L172: lea rax, qword ptr [r8 + 8*rdi] | |
| lea rcx, qword ptr [r9 + 8*rdi] | |
| sub rdx, rdi | |
| movabs rsi, -4294967296 | |
| nop word ptr cs:[rax + rax] | |
| L208: mov rdi, qword ptr [rcx] | |
| Source line: 22 | |
| and rdi, rsi | |
| mov qword ptr [rax], rdi | |
| Source line: 57 | |
| add rax, 8 | |
| add rcx, 8 | |
| dec rdx | |
| jne L208 | |
| L230: mov eax, 29730704 | |
| Source line: 64 | |
| pop rbp | |
| vzeroupper | |
| ret | |
| Source line: 49 | |
| L240: movabs rax, jl_overflow_exception | |
| mov rdi, qword ptr [rax] | |
| movabs rax, jl_throw_with_superfluous_argument | |
| mov esi, 49 | |
| call rax | |
| loop4 ---------------------------------------- | |
| .text | |
| Filename: truncloop.jl | |
| Source line: 0 | |
| push rbp | |
| mov rbp, rsp | |
| Source line: 27 | |
| mov rcx, qword ptr [rdi + 16] | |
| xor eax, eax | |
| Source line: 49 | |
| test rcx, rcx | |
| cmovns rax, rcx | |
| dec rax | |
| jo L96 | |
| inc rax | |
| jo L96 | |
| Source line: 50 | |
| test rax, rax | |
| jle L89 | |
| Source line: 27 | |
| mov rcx, qword ptr [rdi + 8] | |
| mov rdx, qword ptr [rsi + 8] | |
| movabs rsi, -4294967296 | |
| Source line: 28 | |
| vmovq xmm0, rsi | |
| nop word ptr [rax + rax] | |
| L64: vmovsd xmm1, qword ptr [rdx] | |
| vandpd xmm1, xmm1, xmm0 | |
| vmovlpd qword ptr [rcx], xmm1 | |
| Source line: 57 | |
| add rcx, 8 | |
| add rdx, 8 | |
| dec rax | |
| jne L64 | |
| L89: mov eax, 29730704 | |
| Source line: 64 | |
| pop rbp | |
| ret | |
| Source line: 49 | |
| L96: movabs rax, jl_overflow_exception | |
| mov rdi, qword ptr [rax] | |
| movabs rax, jl_throw_with_superfluous_argument | |
| mov esi, 49 | |
| call rax |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment