Last active
August 29, 2015 14:06
-
-
Save simonster/a3b691e71cc2b3826e39 to your computer and use it in GitHub Desktop.
Faster vectorized integer division
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Base.rem, Base.div, Base.divrem | |
unsigned_type(::Int8) = Uint8 | |
unsigned_type(::Int16) = Uint16 | |
unsigned_type(::Int32) = Uint32 | |
unsigned_type(::Int64) = Uint64 | |
unsigned_type(::Int128) = Uint128 | |
immutable SignedMultiplicativeInverse{T<:Signed} | |
divisor::T | |
multiplier::T | |
addmul::Int8 | |
shift::Uint8 | |
function SignedMultiplicativeInverse(d::T) | |
ut = unsigned_type(d) | |
signedmin::ut = typemin(d) | |
ad::ut = abs(d) | |
ad <= 1 && error("cannot compute magic for d == $d") | |
t::ut = signedmin + signbit(d) | |
anc::ut = t - 1 - rem(t, ad) # absolute value of nc | |
p = sizeof(d)*8 - 1 # initialize p | |
q1::ut, r1::ut = divrem(signedmin, anc) | |
q2::ut, r2::ut = divrem(signedmin, ad) | |
while true | |
p += 1 | |
q1 *= 2 # update q1 = 2p/abs(nc) | |
r1 *= 2 # update r1 = rem(2p/abs(nc)) | |
if r1 >= anc # must be unsigned comparison | |
q1 += 1 | |
r1 -= anc | |
end | |
q2 *= 2 # update q2 = 2p/abs(d) | |
r2 *= 2 # update r2 = rem(2p/abs(d)) | |
if r2 >= ad # must be unsigned comparison | |
q2 += 1 | |
r2 -= ad | |
end | |
delta::ut = ad - r2 | |
(q1 < delta || (q1 == delta && r1 == 0)) || break | |
end | |
m = flipsign(oftype(d, q2 + 1), d) # resulting magic number | |
s = p - sizeof(d)*8 # resulting shift | |
new(d, m, d > 0 && m < 0 ? int8(1) : d < 0 && m > 0 ? int8(-1) : int8(0), uint8(s)) | |
end | |
end | |
SignedMultiplicativeInverse(x::Signed) = SignedMultiplicativeInverse{typeof(x)}(x) | |
immutable UnsignedMultiplicativeInverse{T<:Unsigned} | |
divisor::T | |
multiplier::T | |
add::Bool | |
shift::Uint8 | |
function UnsignedMultiplicativeInverse(d::T) | |
d == 0 && error("cannot compute magic for d == 0") | |
u2 = convert(T, 2) | |
add = false | |
signedmin::typeof(d) = one(d) << (sizeof(d)*8-1) | |
signedmax::typeof(d) = signedmin - 1 | |
allones::typeof(d) = zero(d) - 1 | |
nc::typeof(d) = allones - rem(convert(T, allones - d), d) | |
p = 8*sizeof(d) - 1 # initialize p | |
q1::typeof(d), r1::typeof(d) = divrem(signedmin, nc) | |
q2::typeof(d), r2::typeof(d) = divrem(signedmax, d) | |
while true | |
p += 1 | |
if r1 >= convert(T, nc - r1) | |
q1 = q1 + q1 + 1 # update q1 | |
r1 = r1 + r1 - nc # update r1 | |
else | |
q1 = q1 + q1 # update q1 | |
r1 = r1 + r1 # update r1 | |
end | |
if convert(T, r2 + 1) >= convert(T, d - r2) | |
add |= q2 >= signedmax | |
q2 = q2 + q2 + 1 # update q2 | |
r2 = r2 + r2 + 1 - d # update r2 | |
else | |
add |= q2 >= signedmin | |
q2 = q2 + q2 # update q2 | |
r2 = r2 + r2 + 1 # update r2 | |
end | |
delta::typeof(d) = d - 1 - r2 | |
(p < sizeof(d)*16 && (q1 < delta || (q1 == delta && r1 == 0))) || break | |
end | |
m = q2 + 1 # resulting magic number | |
s = p - sizeof(d)*8 - add # resulting shift | |
new(d, convert(typeof(d), m), add, uint8(s)) | |
end | |
end | |
UnsignedMultiplicativeInverse(x::Unsigned) = UnsignedMultiplicativeInverse{typeof(x)}(x) | |
function div{T}(a::T, b::SignedMultiplicativeInverse{T}) | |
x = convert(T, (widen(a)*b.multiplier) >>> sizeof(a)*8) | |
x = convert(T, x + convert(T, a*b.addmul)) | |
convert(T, signbit(x) + (x >> b.shift)) | |
end | |
function div{T}(a::T, b::UnsignedMultiplicativeInverse{T}) | |
x = convert(T, (widen(a)*b.multiplier) >>> sizeof(a)*8) | |
x = ifelse(b.add, convert(T, convert(T, (convert(T, a - x) >>> 1)) + x), x) | |
x >>> b.shift | |
end | |
rem{T}(a::T, b::Union(SignedMultiplicativeInverse{T}, UnsignedMultiplicativeInverse{T})) = | |
a - div(a, b)*b.divisor | |
function divrem{T}(a::T, b::Union(SignedMultiplicativeInverse{T}, UnsignedMultiplicativeInverse{T})) | |
d = div(a, b) | |
(d, a - d*b.divisor) | |
end | |
multiplicativeinverse(x::Signed) = SignedMultiplicativeInverse(x) | |
multiplicativeinverse(x::Unsigned) = UnsignedMultiplicativeInverse(x) | |
A = rand(Int64, 10^8) | |
B = rand(Uint64, 10^8) | |
n = rand(Int64) | |
m = rand(Uint64) | |
x1 = div(A, n) | |
x2 = div(B, m) | |
println("using signed intrinsics") | |
gc() | |
gc_disable() | |
@time for i = 1:2; div(A, n); end | |
gc_enable() | |
println("using unsigned intrinsics") | |
gc() | |
gc_disable() | |
@time for i = 1:2; div(B, m); end | |
gc_enable() | |
import Core.Intrinsics.llvmcall | |
function unsafe_div(num::Array{Int64}, den::Int64) | |
out = similar(num) | |
isempty(num) && return out | |
den == 0 && throw(DivideError()) | |
@inbounds for i = 1:length(out) | |
out[i] = llvmcall("""%3 = sdiv i64 %0, %1 | |
ret i64 %3""", | |
Int64, (Int64, Int64), num[i], den) | |
end | |
out | |
end | |
function unsafe_div(num::Array{Uint64}, den::Uint64) | |
out = similar(num) | |
isempty(num) && return out | |
den == 0 && throw(DivideError()) | |
@inbounds for i = 1:length(out) | |
out[i] = llvmcall("""%3 = udiv i64 %0, %1 | |
ret i64 %3""", | |
Uint64, (Uint64, Uint64), num[i], den) | |
end | |
out | |
end | |
unsafe_div(A, n) | |
unsafe_div(B, m) | |
println("signed without check") | |
gc() | |
gc_disable() | |
@time for i = 1:2; unsafe_div(A, n); end | |
gc_enable() | |
println("unsigned without check") | |
gc() | |
gc_disable() | |
@time for i = 1:2; unsafe_div(B, m); end | |
gc_enable() | |
function div{T<:Signed}(num::StridedArray{T}, den::T) | |
out = similar(num) | |
isempty(num) && return out | |
if den == 0 | |
throw(DivideError()) | |
elseif den == 1 | |
copy!(out, num) | |
elseif den == -1 | |
for i = 1:length(out) | |
@inbounds out[i] = -num[i] | |
end | |
else | |
m = multiplicativeinverse(den) | |
@inbounds if m.addmul == -1 | |
for i = 1:length(out) | |
x = convert(T, convert(T, (widen(num[i])*m.multiplier) >>> sizeof(T)*8) - num[i]) | |
out[i] = signbit(x) + (x >> m.shift) | |
end | |
elseif m.addmul == 1 | |
for i = 1:length(out) | |
x = convert(T, convert(T, (widen(num[i])*m.multiplier) >>> sizeof(T)*8) + num[i]) | |
out[i] = signbit(x) + (x >> m.shift) | |
end | |
else | |
for i = 1:length(out) | |
x = convert(T, (widen(num[i])*m.multiplier) >>> sizeof(T)*8) | |
out[i] = signbit(x) + (x >> m.shift) | |
end | |
end | |
end | |
out | |
end | |
function div{T<:Unsigned}(num::StridedArray{T}, den::T) | |
out = similar(num) | |
isempty(num) && return out | |
if den == 0 | |
throw(DivideError()) | |
else | |
m = multiplicativeinverse(den) | |
@inbounds if m.add | |
for i = 1:length(out) | |
x = convert(T, (widen(num[i])*m.multiplier) >>> sizeof(T)*8) | |
x = convert(T, convert(T, (convert(T, num[i] - x) >>> 1)) + x) | |
out[i] = (x >>> m.shift) | |
end | |
else | |
for i = 1:length(out) | |
x = convert(T, (widen(num[i])*m.multiplier) >>> sizeof(T)*8) | |
out[i] = (x >>> m.shift) | |
end | |
end | |
end | |
out | |
end | |
y1 = div(A, n) | |
y2 = div(B, m) | |
println("using signed inverse") | |
gc() | |
gc_disable() | |
@time for i = 1:2; div(A, n); end | |
gc_enable() | |
println("using unsigned inverse") | |
gc() | |
gc_disable() | |
@time for i = 1:2; div(B, m); end | |
gc_enable() | |
@assert x1 == y1 | |
@assert x2 == y2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using signed intrinsics | |
elapsed time: 1.690360625 seconds (1600000096 bytes allocated) | |
using unsigned intrinsics | |
elapsed time: 1.558626973 seconds (1600000096 bytes allocated) | |
signed without check | |
elapsed time: 1.493279208 seconds (1600000096 bytes allocated) | |
unsigned without check | |
elapsed time: 1.263156649 seconds (1600000096 bytes allocated) | |
using signed inverse | |
elapsed time: 0.525588555 seconds (1600000096 bytes allocated) | |
using unsigned inverse | |
elapsed time: 0.485345448 seconds (1600000096 bytes allocated) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment