Skip to content

Instantly share code, notes, and snippets.

@simonster
Last active August 29, 2015 14:06
Show Gist options
  • Save simonster/a3b691e71cc2b3826e39 to your computer and use it in GitHub Desktop.
Save simonster/a3b691e71cc2b3826e39 to your computer and use it in GitHub Desktop.
Faster vectorized integer division
import Base.rem, Base.div, Base.divrem
unsigned_type(::Int8) = Uint8
unsigned_type(::Int16) = Uint16
unsigned_type(::Int32) = Uint32
unsigned_type(::Int64) = Uint64
unsigned_type(::Int128) = Uint128
immutable SignedMultiplicativeInverse{T<:Signed}
divisor::T
multiplier::T
addmul::Int8
shift::Uint8
function SignedMultiplicativeInverse(d::T)
ut = unsigned_type(d)
signedmin::ut = typemin(d)
ad::ut = abs(d)
ad <= 1 && error("cannot compute magic for d == $d")
t::ut = signedmin + signbit(d)
anc::ut = t - 1 - rem(t, ad) # absolute value of nc
p = sizeof(d)*8 - 1 # initialize p
q1::ut, r1::ut = divrem(signedmin, anc)
q2::ut, r2::ut = divrem(signedmin, ad)
while true
p += 1
q1 *= 2 # update q1 = 2p/abs(nc)
r1 *= 2 # update r1 = rem(2p/abs(nc))
if r1 >= anc # must be unsigned comparison
q1 += 1
r1 -= anc
end
q2 *= 2 # update q2 = 2p/abs(d)
r2 *= 2 # update r2 = rem(2p/abs(d))
if r2 >= ad # must be unsigned comparison
q2 += 1
r2 -= ad
end
delta::ut = ad - r2
(q1 < delta || (q1 == delta && r1 == 0)) || break
end
m = flipsign(oftype(d, q2 + 1), d) # resulting magic number
s = p - sizeof(d)*8 # resulting shift
new(d, m, d > 0 && m < 0 ? int8(1) : d < 0 && m > 0 ? int8(-1) : int8(0), uint8(s))
end
end
SignedMultiplicativeInverse(x::Signed) = SignedMultiplicativeInverse{typeof(x)}(x)
immutable UnsignedMultiplicativeInverse{T<:Unsigned}
divisor::T
multiplier::T
add::Bool
shift::Uint8
function UnsignedMultiplicativeInverse(d::T)
d == 0 && error("cannot compute magic for d == 0")
u2 = convert(T, 2)
add = false
signedmin::typeof(d) = one(d) << (sizeof(d)*8-1)
signedmax::typeof(d) = signedmin - 1
allones::typeof(d) = zero(d) - 1
nc::typeof(d) = allones - rem(convert(T, allones - d), d)
p = 8*sizeof(d) - 1 # initialize p
q1::typeof(d), r1::typeof(d) = divrem(signedmin, nc)
q2::typeof(d), r2::typeof(d) = divrem(signedmax, d)
while true
p += 1
if r1 >= convert(T, nc - r1)
q1 = q1 + q1 + 1 # update q1
r1 = r1 + r1 - nc # update r1
else
q1 = q1 + q1 # update q1
r1 = r1 + r1 # update r1
end
if convert(T, r2 + 1) >= convert(T, d - r2)
add |= q2 >= signedmax
q2 = q2 + q2 + 1 # update q2
r2 = r2 + r2 + 1 - d # update r2
else
add |= q2 >= signedmin
q2 = q2 + q2 # update q2
r2 = r2 + r2 + 1 # update r2
end
delta::typeof(d) = d - 1 - r2
(p < sizeof(d)*16 && (q1 < delta || (q1 == delta && r1 == 0))) || break
end
m = q2 + 1 # resulting magic number
s = p - sizeof(d)*8 - add # resulting shift
new(d, convert(typeof(d), m), add, uint8(s))
end
end
UnsignedMultiplicativeInverse(x::Unsigned) = UnsignedMultiplicativeInverse{typeof(x)}(x)
function div{T}(a::T, b::SignedMultiplicativeInverse{T})
x = convert(T, (widen(a)*b.multiplier) >>> sizeof(a)*8)
x = convert(T, x + convert(T, a*b.addmul))
convert(T, signbit(x) + (x >> b.shift))
end
function div{T}(a::T, b::UnsignedMultiplicativeInverse{T})
x = convert(T, (widen(a)*b.multiplier) >>> sizeof(a)*8)
x = ifelse(b.add, convert(T, convert(T, (convert(T, a - x) >>> 1)) + x), x)
x >>> b.shift
end
rem{T}(a::T, b::Union(SignedMultiplicativeInverse{T}, UnsignedMultiplicativeInverse{T})) =
a - div(a, b)*b.divisor
function divrem{T}(a::T, b::Union(SignedMultiplicativeInverse{T}, UnsignedMultiplicativeInverse{T}))
d = div(a, b)
(d, a - d*b.divisor)
end
multiplicativeinverse(x::Signed) = SignedMultiplicativeInverse(x)
multiplicativeinverse(x::Unsigned) = UnsignedMultiplicativeInverse(x)
A = rand(Int64, 10^8)
B = rand(Uint64, 10^8)
n = rand(Int64)
m = rand(Uint64)
x1 = div(A, n)
x2 = div(B, m)
println("using signed intrinsics")
gc()
gc_disable()
@time for i = 1:2; div(A, n); end
gc_enable()
println("using unsigned intrinsics")
gc()
gc_disable()
@time for i = 1:2; div(B, m); end
gc_enable()
import Core.Intrinsics.llvmcall
function unsafe_div(num::Array{Int64}, den::Int64)
out = similar(num)
isempty(num) && return out
den == 0 && throw(DivideError())
@inbounds for i = 1:length(out)
out[i] = llvmcall("""%3 = sdiv i64 %0, %1
ret i64 %3""",
Int64, (Int64, Int64), num[i], den)
end
out
end
function unsafe_div(num::Array{Uint64}, den::Uint64)
out = similar(num)
isempty(num) && return out
den == 0 && throw(DivideError())
@inbounds for i = 1:length(out)
out[i] = llvmcall("""%3 = udiv i64 %0, %1
ret i64 %3""",
Uint64, (Uint64, Uint64), num[i], den)
end
out
end
unsafe_div(A, n)
unsafe_div(B, m)
println("signed without check")
gc()
gc_disable()
@time for i = 1:2; unsafe_div(A, n); end
gc_enable()
println("unsigned without check")
gc()
gc_disable()
@time for i = 1:2; unsafe_div(B, m); end
gc_enable()
function div{T<:Signed}(num::StridedArray{T}, den::T)
out = similar(num)
isempty(num) && return out
if den == 0
throw(DivideError())
elseif den == 1
copy!(out, num)
elseif den == -1
for i = 1:length(out)
@inbounds out[i] = -num[i]
end
else
m = multiplicativeinverse(den)
@inbounds if m.addmul == -1
for i = 1:length(out)
x = convert(T, convert(T, (widen(num[i])*m.multiplier) >>> sizeof(T)*8) - num[i])
out[i] = signbit(x) + (x >> m.shift)
end
elseif m.addmul == 1
for i = 1:length(out)
x = convert(T, convert(T, (widen(num[i])*m.multiplier) >>> sizeof(T)*8) + num[i])
out[i] = signbit(x) + (x >> m.shift)
end
else
for i = 1:length(out)
x = convert(T, (widen(num[i])*m.multiplier) >>> sizeof(T)*8)
out[i] = signbit(x) + (x >> m.shift)
end
end
end
out
end
function div{T<:Unsigned}(num::StridedArray{T}, den::T)
out = similar(num)
isempty(num) && return out
if den == 0
throw(DivideError())
else
m = multiplicativeinverse(den)
@inbounds if m.add
for i = 1:length(out)
x = convert(T, (widen(num[i])*m.multiplier) >>> sizeof(T)*8)
x = convert(T, convert(T, (convert(T, num[i] - x) >>> 1)) + x)
out[i] = (x >>> m.shift)
end
else
for i = 1:length(out)
x = convert(T, (widen(num[i])*m.multiplier) >>> sizeof(T)*8)
out[i] = (x >>> m.shift)
end
end
end
out
end
y1 = div(A, n)
y2 = div(B, m)
println("using signed inverse")
gc()
gc_disable()
@time for i = 1:2; div(A, n); end
gc_enable()
println("using unsigned inverse")
gc()
gc_disable()
@time for i = 1:2; div(B, m); end
gc_enable()
@assert x1 == y1
@assert x2 == y2
using signed intrinsics
elapsed time: 1.690360625 seconds (1600000096 bytes allocated)
using unsigned intrinsics
elapsed time: 1.558626973 seconds (1600000096 bytes allocated)
signed without check
elapsed time: 1.493279208 seconds (1600000096 bytes allocated)
unsigned without check
elapsed time: 1.263156649 seconds (1600000096 bytes allocated)
using signed inverse
elapsed time: 0.525588555 seconds (1600000096 bytes allocated)
using unsigned inverse
elapsed time: 0.485345448 seconds (1600000096 bytes allocated)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment