Last active
August 29, 2015 14:05
-
-
Save jiahao/f2e20f2b42118ae06b79 to your computer and use it in GitHub Desktop.
An implementation of doublelength floating point arithmetic as described in Dekker, 1971. http://link.springer.com/article/10.1007%2FBF01397083 (See also https://github.com/jiahao/DoublelengthFloat.jl)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Base: +, -, ⋅, *, /, √, convert | |
export DoublelengthFloat | |
immutable DoublelengthFloat{T<:FloatingPoint} <: FloatingPoint | |
head :: T | |
tail :: T | |
end | |
DoublelengthFloat(x::FloatingPoint) = DoublelengthFloat(x, zero(x)) | |
convert{T<:FloatingPoint}(::Type{DoublelengthFloat{T}}, x::T) = DoublelengthFloat(x) | |
function +{T<:FloatingPoint}(a::DoublelengthFloat{T}, b::DoublelengthFloat{T}) | |
x, xx = a.head, a.tail | |
y, yy = b.head, b.tail | |
r = x + y | |
s = abs(x) > abs(y) ? x - r + y + xx + xx : | |
y - r + x + xx + yy | |
z = r + s | |
zz = r - z + s | |
DoublelengthFloat(z, zz) | |
end | |
function -{T<:FloatingPoint}(a::DoublelengthFloat{T}, b::DoublelengthFloat{T}) | |
x, xx = a.head, a.tail | |
y, yy = b.head, b.tail | |
r = x - y | |
s = abs(x) > abs(y) ? x - r - y - yy + xx : | |
-y - r + x + xx - yy | |
z = r + s | |
zz = r - z + s | |
DoublelengthFloat(z, zz) | |
end | |
#mul12 | |
function ⋅{T<:FloatingPoint}(x::T, y::T) | |
const t = 53 #Float64 | |
const constant = 2^(t-t÷2)+1 | |
p = x * constant | |
hx = x - p + p | |
tx = x - hx | |
p = y * constant | |
hy = y - p + p | |
ty = y - hy | |
p = hx * hy | |
q = hx * ty + tx * hy | |
z = p + q | |
zz = p - z + q + tx * ty | |
DoublelengthFloat(z, zz) | |
end | |
#mul2 | |
function *{T<:FloatingPoint}(a::DoublelengthFloat{T}, b::DoublelengthFloat{T}) | |
x, xx = a.head, a.tail | |
y, yy = b.head, b.tail | |
C = x ⋅ y #mul12 | |
c, cc = C.head, C.tail | |
cc = x*yy + xx*y + cc | |
z = c + cc | |
zz = c - z + cc | |
DoublelengthFloat(z, zz) | |
end | |
#div2 | |
function /{T<:FloatingPoint}(a::DoublelengthFloat{T}, b::DoublelengthFloat{T}) | |
x, xx = a.head, a.tail | |
y, yy = b.head, b.tail | |
c = x / y | |
U = c ⋅ y | |
u, uu = U.head, U.tail | |
cc = (x - u - uu + xx - c * yy)/y | |
z = c + cc | |
zz = c - z + cc | |
DoublelengthFloat(z, zz) | |
end | |
function √{T<:FloatingPoint}(a::DoublelengthFloat{T}) | |
x, xx = a.head, a.tail | |
x ≥ 0 || throw(DomainError()) | |
c = √x | |
U = c ⋅ c | |
u, uu = U.head, U.tail | |
cc = (x - u - uu + xx) * 0.5/c | |
y = c + cc | |
yy = c - y + cc | |
DoublelengthFloat(y, yy) | |
end | |
@assert DoublelengthFloat(1.0, eps()*0.125) + DoublelengthFloat(1.0, eps()*0.25) == DoublelengthFloat(2.0, 0.375eps()) | |
@assert DoublelengthFloat(1.0, 0.125eps()) - DoublelengthFloat(1.0, 0.25eps()) == DoublelengthFloat(-0.125eps(), 0.0) | |
@assert (1.0+eps()) ⋅ (1.0+2eps()) == DoublelengthFloat(1.0+3eps(), 2eps()^2) | |
@assert DoublelengthFloat(1.0, 0.5eps()) * DoublelengthFloat(1.0, 0.75eps()) == DoublelengthFloat(1.0+eps(), 0.25eps()) | |
@assert DoublelengthFloat(1.0, 0.125eps()) / DoublelengthFloat(1.0, 128eps()) == DoublelengthFloat(1.0-128eps(), 0.125eps()) | |
@assert sqrt(DoublelengthFloat(1.0, 0.25eps())) == DoublelengthFloat(1.0, 0.125eps()) | |
n=10^7 | |
d=randn(n) | |
z=randn(n) | |
function f(n, d, z) | |
thesum = DoublelengthFloat(0.0) | |
for i=1:n | |
zz=DoublelengthFloat(z[i]) | |
dd=DoublelengthFloat(d[i]) | |
thesum += zz^2/dd | |
end | |
thesum | |
end | |
for (fp, fpc) in [(DoublelengthFloat{Float64}, DoublelengthFloat)] | |
@eval begin | |
function g(n, d, z, ::Type{$fp}) | |
thesum = $fpc(0.0)#convert($fp, 0.0) | |
for i=1:n | |
zz=$fpc(z[i])#zz=convert($fp, z[i]) | |
dd=$fpc(d[i])#dd=convert($fp, d[i]) | |
thesum += zz^2/dd | |
end | |
thesum | |
end | |
function h(n, d, z, ::Type{$fp}) | |
thesum = convert($fp, 0.0) | |
for i=1:n | |
zz=convert($fp, z[i]) | |
dd=convert($fp, d[i]) | |
thesum += zz^2/dd | |
end | |
thesum | |
end | |
end | |
end | |
f(1, [1.], [1.]) | |
@time f(n, d, z) | |
for ff in [g, h] | |
ff(1, [1.], [1.], DoublelengthFloat{Float64}) #Precompile | |
@time ff(n, d, z, DoublelengthFloat{Float64}) | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment