Skip to content

Instantly share code, notes, and snippets.

@ArchRobison
Created January 27, 2015 18:44
Show Gist options
  • Select an option

  • Save ArchRobison/cb1aab380905fdc1255f to your computer and use it in GitHub Desktop.

Select an option

Save ArchRobison/cb1aab380905fdc1255f to your computer and use it in GitHub Desktop.
Attempts to vectorize with two of the variants in https://gist.github.com/simonbyrne/62f25608aaf831de5996 .
$ cat truncloop.jl
# use floating point AND instructions (andps/andpd)
# https://github.com/JuliaLang/julia/issues/9868
function and_float(x::Float64,y::Float64)
Base.llvmcall("""%av = insertelement<2 x double> undef, double %0, i32 0
%bv = insertelement<2 x double> undef, double %1, i32 0
%ai = bitcast <2 x double> %av to <2 x i64>
%bi = bitcast <2 x double> %bv to <2 x i64>
%and.i = and <2 x i64> %ai, %bi
%cf = bitcast <2 x i64> %and.i to <2 x double>
%cfe = extractelement<2 x double> %cf, i32 0
ret double %cfe""",Float64,(Float64,Float64),x,y)
end
# 3) AND out bits
t3(x::Float64) = reinterpret(Float64,reinterpret(UInt64,x) & 0xffff_ffff_0000_0000)
# 4) AND out bits using and_float
t4(x::Float64) = and_float(x,reinterpret(Float64, 0xffff_ffff_0000_0000))
function loop3(a,b)
@inbounds @simd for i = 1:length(a)
a[i] = t3(b[i])
end
end
function loop4(a,b)
@inbounds @simd for i = 1:length(a)
a[i] = t4(b[i])
end
end
println("loop3 ----------------------------------------")
code_native(loop3,(Matrix{Float64},Matrix{Float64}))
println()
println("loop4 ----------------------------------------")
code_native(loop4,(Matrix{Float64},Matrix{Float64}))
println()
$ julia truncloop.jl
loop3 ----------------------------------------
.text
Filename: truncloop.jl
Source line: 0
push rbp
mov rbp, rsp
Source line: 21
mov rax, qword ptr [rdi + 16]
xor edx, edx
Source line: 49
test rax, rax
cmovns rdx, rax
dec rdx
jo L240
inc rdx
jo L240
Source line: 50
test rdx, rdx
jle L230
Source line: 21
mov r8, qword ptr [rdi + 8]
mov r9, qword ptr [rsi + 8]
xor edi, edi
test rdx, rdx
je L172
xor edi, edi
Source line: 56
mov r10, rdx
and r10, -16
je L167
lea rdi, qword ptr [r8 + 96]
lea rcx, qword ptr [r9 + 96]
mov rax, rdx
and rax, -16
movabs rsi, 140719585243136
Source line: 22
vbroadcastsd ymm0, qword ptr [rsi]
nop word ptr cs:[rax + rax]
L112: vandps ymm1, ymm0, ymmword ptr [rcx - 96]
vandps ymm2, ymm0, ymmword ptr [rcx - 64]
vandps ymm3, ymm0, ymmword ptr [rcx - 32]
vandps ymm4, ymm0, ymmword ptr [rcx]
vmovups ymmword ptr [rdi - 96], ymm1
vmovups ymmword ptr [rdi - 64], ymm2
vmovups ymmword ptr [rdi - 32], ymm3
vmovups ymmword ptr [rdi], ymm4
Source line: 56
sub rdi, -128
sub rcx, -128
add rax, -16
jne L112
mov rdi, r10
L167: cmp rdx, rdi
je L230
L172: lea rax, qword ptr [r8 + 8*rdi]
lea rcx, qword ptr [r9 + 8*rdi]
sub rdx, rdi
movabs rsi, -4294967296
nop word ptr cs:[rax + rax]
L208: mov rdi, qword ptr [rcx]
Source line: 22
and rdi, rsi
mov qword ptr [rax], rdi
Source line: 57
add rax, 8
add rcx, 8
dec rdx
jne L208
L230: mov eax, 29730704
Source line: 64
pop rbp
vzeroupper
ret
Source line: 49
L240: movabs rax, jl_overflow_exception
mov rdi, qword ptr [rax]
movabs rax, jl_throw_with_superfluous_argument
mov esi, 49
call rax
loop4 ----------------------------------------
.text
Filename: truncloop.jl
Source line: 0
push rbp
mov rbp, rsp
Source line: 27
mov rcx, qword ptr [rdi + 16]
xor eax, eax
Source line: 49
test rcx, rcx
cmovns rax, rcx
dec rax
jo L96
inc rax
jo L96
Source line: 50
test rax, rax
jle L89
Source line: 27
mov rcx, qword ptr [rdi + 8]
mov rdx, qword ptr [rsi + 8]
movabs rsi, -4294967296
Source line: 28
vmovq xmm0, rsi
nop word ptr [rax + rax]
L64: vmovsd xmm1, qword ptr [rdx]
vandpd xmm1, xmm1, xmm0
vmovlpd qword ptr [rcx], xmm1
Source line: 57
add rcx, 8
add rdx, 8
dec rax
jne L64
L89: mov eax, 29730704
Source line: 64
pop rbp
ret
Source line: 49
L96: movabs rax, jl_overflow_exception
mov rdi, qword ptr [rax]
movabs rax, jl_throw_with_superfluous_argument
mov esi, 49
call rax
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment