Skip to content

Instantly share code, notes, and snippets.

@jiahao
Last active August 29, 2015 14:05
Show Gist options
  • Save jiahao/f2e20f2b42118ae06b79 to your computer and use it in GitHub Desktop.
Save jiahao/f2e20f2b42118ae06b79 to your computer and use it in GitHub Desktop.
An implementation of doublelength floating point arithmetic as described in Dekker, 1971. http://link.springer.com/article/10.1007%2FBF01397083 (See also https://github.com/jiahao/DoublelengthFloat.jl)
import Base: +, -, ⋅, *, /, √, convert
export DoublelengthFloat
immutable DoublelengthFloat{T<:FloatingPoint} <: FloatingPoint
head :: T
tail :: T
end
DoublelengthFloat(x::FloatingPoint) = DoublelengthFloat(x, zero(x))
convert{T<:FloatingPoint}(::Type{DoublelengthFloat{T}}, x::T) = DoublelengthFloat(x)
function +{T<:FloatingPoint}(a::DoublelengthFloat{T}, b::DoublelengthFloat{T})
x, xx = a.head, a.tail
y, yy = b.head, b.tail
r = x + y
s = abs(x) > abs(y) ? x - r + y + xx + xx :
y - r + x + xx + yy
z = r + s
zz = r - z + s
DoublelengthFloat(z, zz)
end
function -{T<:FloatingPoint}(a::DoublelengthFloat{T}, b::DoublelengthFloat{T})
x, xx = a.head, a.tail
y, yy = b.head, b.tail
r = x - y
s = abs(x) > abs(y) ? x - r - y - yy + xx :
-y - r + x + xx - yy
z = r + s
zz = r - z + s
DoublelengthFloat(z, zz)
end
#mul12
function ⋅{T<:FloatingPoint}(x::T, y::T)
const t = 53 #Float64
const constant = 2^(t-t÷2)+1
p = x * constant
hx = x - p + p
tx = x - hx
p = y * constant
hy = y - p + p
ty = y - hy
p = hx * hy
q = hx * ty + tx * hy
z = p + q
zz = p - z + q + tx * ty
DoublelengthFloat(z, zz)
end
#mul2
function *{T<:FloatingPoint}(a::DoublelengthFloat{T}, b::DoublelengthFloat{T})
x, xx = a.head, a.tail
y, yy = b.head, b.tail
C = x ⋅ y #mul12
c, cc = C.head, C.tail
cc = x*yy + xx*y + cc
z = c + cc
zz = c - z + cc
DoublelengthFloat(z, zz)
end
#div2
function /{T<:FloatingPoint}(a::DoublelengthFloat{T}, b::DoublelengthFloat{T})
x, xx = a.head, a.tail
y, yy = b.head, b.tail
c = x / y
U = c ⋅ y
u, uu = U.head, U.tail
cc = (x - u - uu + xx - c * yy)/y
z = c + cc
zz = c - z + cc
DoublelengthFloat(z, zz)
end
function √{T<:FloatingPoint}(a::DoublelengthFloat{T})
x, xx = a.head, a.tail
x ≥ 0 || throw(DomainError())
c = √x
U = c ⋅ c
u, uu = U.head, U.tail
cc = (x - u - uu + xx) * 0.5/c
y = c + cc
yy = c - y + cc
DoublelengthFloat(y, yy)
end
@assert DoublelengthFloat(1.0, eps()*0.125) + DoublelengthFloat(1.0, eps()*0.25) == DoublelengthFloat(2.0, 0.375eps())
@assert DoublelengthFloat(1.0, 0.125eps()) - DoublelengthFloat(1.0, 0.25eps()) == DoublelengthFloat(-0.125eps(), 0.0)
@assert (1.0+eps()) ⋅ (1.0+2eps()) == DoublelengthFloat(1.0+3eps(), 2eps()^2)
@assert DoublelengthFloat(1.0, 0.5eps()) * DoublelengthFloat(1.0, 0.75eps()) == DoublelengthFloat(1.0+eps(), 0.25eps())
@assert DoublelengthFloat(1.0, 0.125eps()) / DoublelengthFloat(1.0, 128eps()) == DoublelengthFloat(1.0-128eps(), 0.125eps())
@assert sqrt(DoublelengthFloat(1.0, 0.25eps())) == DoublelengthFloat(1.0, 0.125eps())
n=10^7
d=randn(n)
z=randn(n)
function f(n, d, z)
thesum = DoublelengthFloat(0.0)
for i=1:n
zz=DoublelengthFloat(z[i])
dd=DoublelengthFloat(d[i])
thesum += zz^2/dd
end
thesum
end
for (fp, fpc) in [(DoublelengthFloat{Float64}, DoublelengthFloat)]
@eval begin
function g(n, d, z, ::Type{$fp})
thesum = $fpc(0.0)#convert($fp, 0.0)
for i=1:n
zz=$fpc(z[i])#zz=convert($fp, z[i])
dd=$fpc(d[i])#dd=convert($fp, d[i])
thesum += zz^2/dd
end
thesum
end
function h(n, d, z, ::Type{$fp})
thesum = convert($fp, 0.0)
for i=1:n
zz=convert($fp, z[i])
dd=convert($fp, d[i])
thesum += zz^2/dd
end
thesum
end
end
end
f(1, [1.], [1.])
@time f(n, d, z)
for ff in [g, h]
ff(1, [1.], [1.], DoublelengthFloat{Float64}) #Precompile
@time ff(n, d, z, DoublelengthFloat{Float64})
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment