Skip to content

Instantly share code, notes, and snippets.

@JeffreySarnoff
Last active August 19, 2022 03:40
Show Gist options
  • Save JeffreySarnoff/fe3420bb1d518310f3c7306e342231aa to your computer and use it in GitHub Desktop.
Save JeffreySarnoff/fe3420bb1d518310f3c7306e342231aa to your computer and use it in GitHub Desktop.
Arm log function
#=
* Data for log.
*
* Copyright (c) 2018, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
=#
const OneAsUInt64 = 0x3ff0000000000000
const LOG_TABLE_BITS = 7
const LOG_POLY_ORDER = 6
const LOG_POLY1_ORDER = 12
const N4LOG = (1 << LOG_TABLE_BITS)
const ln2hi = 0x1.62e42fefa3800p-1
const ln2lo = 0x1.ef35793c76730p-45
const Ln2hi = ln2hi
const Ln2lo = ln2lo
const OFF = 0x3fe6000000000000
#=
julia> LO, HI
0x3fee000000000000, 0x3fee000000000000,
0.9375, 1.064697265625
=#
const LO = 0x3fee000000000000 # asuint64(1.0 - 0x1p-4)
const HI = 0x3ff1090000000000 # asuint64(1.0 + 0x1.09p-4)
const HIminusLO = HI - LO # 4.22090426772484e-309
if LOG_POLY1_ORDER == 10
# relative error: 0x1.32eccc6p-62
# in -0x1p-5 0x1.1p-5 (|log(1+x)| > 0x1p-5 outside this interval)
const log_poly1_coeffs = (
-0x1p-1,
0x1.55555555554e5p-2,
-0x1.0000000000af2p-2,
0x1.9999999bbe436p-3,
-0x1.55555537f9cdep-3,
0x1.24922fc8127cfp-3,
-0x1.0000b7d6bb612p-3,
0x1.c806ee1ddbcafp-4,
-0x1.972335a9c2d6ep-4,
);
elseif LOG_POLY1_ORDER == 11
# relative error: 0x1.52c8b708p-68
# in -0x1p-5 0x1.1p-5 (|log(1+x)| > 0x1p-5 outside this interval)
const log_poly1_coeffs = (
-0x1p-1,
0x1.5555555555555p-2,
-0x1.ffffffffffea9p-3,
0x1.999999999c4d4p-3,
-0x1.55555557f5541p-3,
0x1.249248fbe33e4p-3,
-0x1.ffffc9a3c825bp-4,
0x1.c71e1f204435dp-4,
-0x1.9a7f26377d06ep-4,
0x1.71c30cf8f7364p-4,
);
elseif LOG_POLY1_ORDER == 12
# relative error: 0x1.c04d76cp-63
# in -0x1p-4 0x1.09p-4 (|log(1+x)| > 0x1p-4 outside the interval)
const log_poly1_coeffs = (
-0x1p-1,
0x1.5555555555577p-2,
-0x1.ffffffffffdcbp-3,
0x1.999999995dd0cp-3,
-0x1.55555556745a7p-3,
0x1.24924a344de3p-3,
-0x1.fffffa4423d65p-4,
0x1.c7184282ad6cap-4,
-0x1.999eb43b068ffp-4,
0x1.78182f7afd085p-4,
-0x1.5521375d145cdp-4,
);
end
if N4LOG == 64 && LOG_POLY_ORDER == 7
# relative error: 0x1.906eb8ap-58
# abs error: 0x1.d2cad5a8p-67
# in -0x1.fp-8 0x1.fp-8
const log_poly_coeffs = (
-0x1.0000000000027p-1,
0x1.555555555556ap-2,
-0x1.fffffff0440bap-3,
0x1.99999991906c3p-3,
-0x1.555c8d7e8201ep-3,
0x1.24978c59151fap-3,
);
elseif N4LOG == 128 && LOG_POLY_ORDER == 6
# relative error: 0x1.926199e8p-56
# abs error: 0x1.882ff33p-65
# in -0x1.fp-9 0x1.fp-9
const log_poly_coeffs = (
-0x1.0000000000001p-1,
0x1.555555551305bp-2,
-0x1.fffffffeb459p-3,
0x1.999b324f10111p-3,
-0x1.55575e506c89fp-3,
);
elseif N4LOG == 128 && LOG_POLY_ORDER == 7
# relative error: 0x1.649fc4bp-64
# abs error: 0x1.c3b5769p-74
# in -0x1.fp-9 0x1.fp-9
const log_poly_coeffs = (
-0x1.0000000000001p-1,
0x1.5555555555556p-2,
-0x1.fffffffea1a8p-3,
0x1.99999998e9139p-3,
-0x1.555776801b968p-3,
0x1.2493c29331a5cp-3,
);
end
const A = log_poly_coeffs
#=
Algorithm:
x = 2^k z
log(x) = k ln2 + log(c) + log(z/c)
log(z/c) = poly(z/c - 1)
where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls
into the ith one, then table entries are computed as
tab[i].invc = 1/c
tab[i].logc = (double)log(c)
tab2[i].chi = (double)c
tab2[i].clo = (double)(c - (double)c)
where c is near the center of the subinterval and is chosen by trying +-2^29
floating point invc candidates around 1/center and selecting one for which
1) the rounding error in 0x1.8p9 + logc is 0,
2) the rounding error in z - chi - clo is < 0x1p-66 and
3) the rounding error in (double)log(c) is minimized (< 0x1p-66).
Note: 1) ensures that k*ln2hi + logc can be computed without rounding error,
2) ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to
a single rounding error when there is no fast fma for z*invc - 1,
3) ensures that logc + poly(z/c - 1) has small error, however near x == 1 when
|log(x)| < 0x1p-4, this is not enough so that is special cased.
=#
const log_table = (
(0x1.734f0c3e0de9fp+0, -0x1.7cc7f79e69000p-2),
(0x1.713786a2ce91fp+0, -0x1.76feec20d0000p-2),
(0x1.6f26008fab5a0p+0, -0x1.713e31351e000p-2),
(0x1.6d1a61f138c7dp+0, -0x1.6b85b38287800p-2),
(0x1.6b1490bc5b4d1p+0, -0x1.65d5590807800p-2),
(0x1.69147332f0cbap+0, -0x1.602d076180000p-2),
(0x1.6719f18224223p+0, -0x1.5a8ca86909000p-2),
(0x1.6524f99a51ed9p+0, -0x1.54f4356035000p-2),
(0x1.63356aa8f24c4p+0, -0x1.4f637c36b4000p-2),
(0x1.614b36b9ddc14p+0, -0x1.49da7fda85000p-2),
(0x1.5f66452c65c4cp+0, -0x1.445923989a800p-2),
(0x1.5d867b5912c4fp+0, -0x1.3edf439b0b800p-2),
(0x1.5babccb5b90dep+0, -0x1.396ce448f7000p-2),
(0x1.59d61f2d91a78p+0, -0x1.3401e17bda000p-2),
(0x1.5805612465687p+0, -0x1.2e9e2ef468000p-2),
(0x1.56397cee76bd3p+0, -0x1.2941b3830e000p-2),
(0x1.54725e2a77f93p+0, -0x1.23ec58cda8800p-2),
(0x1.52aff42064583p+0, -0x1.1e9e129279000p-2),
(0x1.50f22dbb2bddfp+0, -0x1.1956d2b48f800p-2),
(0x1.4f38f4734ded7p+0, -0x1.141679ab9f800p-2),
(0x1.4d843cfde2840p+0, -0x1.0edd094ef9800p-2),
(0x1.4bd3ec078a3c8p+0, -0x1.09aa518db1000p-2),
(0x1.4a27fc3e0258ap+0, -0x1.047e65263b800p-2),
(0x1.4880524d48434p+0, -0x1.feb224586f000p-3),
(0x1.46dce1b192d0bp+0, -0x1.f474a7517b000p-3),
(0x1.453d9d3391854p+0, -0x1.ea4443d103000p-3),
(0x1.43a2744b4845ap+0, -0x1.e020d44e9b000p-3),
(0x1.420b54115f8fbp+0, -0x1.d60a22977f000p-3),
(0x1.40782da3ef4b1p+0, -0x1.cc00104959000p-3),
(0x1.3ee8f5d57fe8fp+0, -0x1.c202956891000p-3),
(0x1.3d5d9a00b4ce9p+0, -0x1.b81178d811000p-3),
(0x1.3bd60c010c12bp+0, -0x1.ae2c9ccd3d000p-3),
(0x1.3a5242b75dab8p+0, -0x1.a45402e129000p-3),
(0x1.38d22cd9fd002p+0, -0x1.9a877681df000p-3),
(0x1.3755bc5847a1cp+0, -0x1.90c6d69483000p-3),
(0x1.35dce49ad36e2p+0, -0x1.87120a645c000p-3),
(0x1.34679984dd440p+0, -0x1.7d68fb4143000p-3),
(0x1.32f5cceffcb24p+0, -0x1.73cb83c627000p-3),
(0x1.3187775a10d49p+0, -0x1.6a39a9b376000p-3),
(0x1.301c8373e3990p+0, -0x1.60b3154b7a000p-3),
(0x1.2eb4ebb95f841p+0, -0x1.5737d76243000p-3),
(0x1.2d50a0219a9d1p+0, -0x1.4dc7b8fc23000p-3),
(0x1.2bef9a8b7fd2ap+0, -0x1.4462c51d20000p-3),
(0x1.2a91c7a0c1babp+0, -0x1.3b08abc830000p-3),
(0x1.293726014b530p+0, -0x1.31b996b490000p-3),
(0x1.27dfa5757a1f5p+0, -0x1.2875490a44000p-3),
(0x1.268b39b1d3bbfp+0, -0x1.1f3b9f879a000p-3),
(0x1.2539d838ff5bdp+0, -0x1.160c8252ca000p-3),
(0x1.23eb7aac9083bp+0, -0x1.0ce7f57f72000p-3),
(0x1.22a012ba940b6p+0, -0x1.03cdc49fea000p-3),
(0x1.2157996cc4132p+0, -0x1.f57bdbc4b8000p-4),
(0x1.201201dd2fc9bp+0, -0x1.e370896404000p-4),
(0x1.1ecf4494d480bp+0, -0x1.d17983ef94000p-4),
(0x1.1d8f5528f6569p+0, -0x1.bf9674ed8a000p-4),
(0x1.1c52311577e7cp+0, -0x1.adc79202f6000p-4),
(0x1.1b17c74cb26e9p+0, -0x1.9c0c3e7288000p-4),
(0x1.19e010c2c1ab6p+0, -0x1.8a646b372c000p-4),
(0x1.18ab07bb670bdp+0, -0x1.78d01b3ac0000p-4),
(0x1.1778a25efbcb6p+0, -0x1.674f145380000p-4),
(0x1.1648d354c31dap+0, -0x1.55e0e6d878000p-4),
(0x1.151b990275fddp+0, -0x1.4485cdea1e000p-4),
(0x1.13f0ea432d24cp+0, -0x1.333d94d6aa000p-4),
(0x1.12c8b7210f9dap+0, -0x1.22079f8c56000p-4),
(0x1.11a3028ecb531p+0, -0x1.10e4698622000p-4),
(0x1.107fbda8434afp+0, -0x1.ffa6c6ad20000p-5),
(0x1.0f5ee0f4e6bb3p+0, -0x1.dda8d4a774000p-5),
(0x1.0e4065d2a9fcep+0, -0x1.bbcece4850000p-5),
(0x1.0d244632ca521p+0, -0x1.9a1894012c000p-5),
(0x1.0c0a77ce2981ap+0, -0x1.788583302c000p-5),
(0x1.0af2f83c636d1p+0, -0x1.5715e67d68000p-5),
(0x1.09ddb98a01339p+0, -0x1.35c8a49658000p-5),
(0x1.08cabaf52e7dfp+0, -0x1.149e364154000p-5),
(0x1.07b9f2f4e28fbp+0, -0x1.e72c082eb8000p-6),
(0x1.06ab58c358f19p+0, -0x1.a55f152528000p-6),
(0x1.059eea5ecf92cp+0, -0x1.63d62cf818000p-6),
(0x1.04949cdd12c90p+0, -0x1.228fb8caa0000p-6),
(0x1.038c6c6f0ada9p+0, -0x1.c317b20f90000p-7),
(0x1.02865137932a9p+0, -0x1.419355daa0000p-7),
(0x1.0182427ea7348p+0, -0x1.81203c2ec0000p-8),
(0x1.008040614b195p+0, -0x1.0040979240000p-9),
(0x1.fe01ff726fa1ap-1, 0x1.feff384900000p-9),
(0x1.fa11cc261ea74p-1, 0x1.7dc41353d0000p-7),
(0x1.f6310b081992ep-1, 0x1.3cea3c4c28000p-6),
(0x1.f25f63ceeadcdp-1, 0x1.b9fc114890000p-6),
(0x1.ee9c8039113e7p-1, 0x1.1b0d8ce110000p-5),
(0x1.eae8078cbb1abp-1, 0x1.58a5bd001c000p-5),
(0x1.e741aa29d0c9bp-1, 0x1.95c8340d88000p-5),
(0x1.e3a91830a99b5p-1, 0x1.d276aef578000p-5),
(0x1.e01e009609a56p-1, 0x1.07598e598c000p-4),
(0x1.dca01e577bb98p-1, 0x1.253f5e30d2000p-4),
(0x1.d92f20b7c9103p-1, 0x1.42edd8b380000p-4),
(0x1.d5cac66fb5ccep-1, 0x1.606598757c000p-4),
(0x1.d272caa5ede9dp-1, 0x1.7da76356a0000p-4),
(0x1.cf26e3e6b2ccdp-1, 0x1.9ab434e1c6000p-4),
(0x1.cbe6da2a77902p-1, 0x1.b78c7bb0d6000p-4),
(0x1.c8b266d37086dp-1, 0x1.d431332e72000p-4),
(0x1.c5894bd5d5804p-1, 0x1.f0a3171de6000p-4),
(0x1.c26b533bb9f8cp-1, 0x1.067152b914000p-3),
(0x1.bf583eeece73fp-1, 0x1.147858292b000p-3),
(0x1.bc4fd75db96c1p-1, 0x1.2266ecdca3000p-3),
(0x1.b951e0c864a28p-1, 0x1.303d7a6c55000p-3),
(0x1.b65e2c5ef3e2cp-1, 0x1.3dfc33c331000p-3),
(0x1.b374867c9888bp-1, 0x1.4ba366b7a8000p-3),
(0x1.b094b211d304ap-1, 0x1.5933928d1f000p-3),
(0x1.adbe885f2ef7ep-1, 0x1.66acd2418f000p-3),
(0x1.aaf1d31603da2p-1, 0x1.740f8ec669000p-3),
(0x1.a82e63fd358a7p-1, 0x1.815c0f51af000p-3),
(0x1.a5740ef09738bp-1, 0x1.8e92954f68000p-3),
(0x1.a2c2a90ab4b27p-1, 0x1.9bb3602f84000p-3),
(0x1.a01a01393f2d1p-1, 0x1.a8bed1c2c0000p-3),
(0x1.9d79f24db3c1bp-1, 0x1.b5b515c01d000p-3),
(0x1.9ae2505c7b190p-1, 0x1.c2967ccbcc000p-3),
(0x1.9852ef297ce2fp-1, 0x1.cf635d5486000p-3),
(0x1.95cbaeea44b75p-1, 0x1.dc1bd3446c000p-3),
(0x1.934c69de74838p-1, 0x1.e8c01b8cfe000p-3),
(0x1.90d4f2f6752e6p-1, 0x1.f5509c0179000p-3),
(0x1.8e6528effd79dp-1, 0x1.00e6c121fb800p-2),
(0x1.8bfce9fcc007cp-1, 0x1.071b80e93d000p-2),
(0x1.899c0dabec30ep-1, 0x1.0d46b9e867000p-2),
(0x1.87427aa2317fbp-1, 0x1.13687334bd000p-2),
(0x1.84f00acb39a08p-1, 0x1.1980d67234800p-2),
(0x1.82a49e8653e55p-1, 0x1.1f8ffe0cc8000p-2),
(0x1.8060195f40260p-1, 0x1.2595fd7636800p-2),
(0x1.7e22563e0a329p-1, 0x1.2b9300914a800p-2),
(0x1.7beb377dcb5adp-1, 0x1.3187210436000p-2),
(0x1.79baa679725c2p-1, 0x1.377266dec1800p-2),
(0x1.77907f2170657p-1, 0x1.3d54ffbaf3000p-2),
(0x1.756cadbd6130cp-1, 0x1.432eee32fe000p-2),
);
const T = log_table
# >>>> functions
# =========
# Float64 -> UInt64
@inline function asuint64(x::Float64)
reinterpret(UInt64, x)
end
# Top 16 bits of a Float64 as a UInt32
@inline function top16(x::Float64)
(asuint64(x) >> 48) % UInt32
end
const Ccoeffs = (0.0, 0.3333333333333352, -0.24999999999998432, 0.19999999999320328, -0.16666666669929706, 0.14285715076560868, -0.12499997863982555, 0.11110712032936046, -0.10000486757818193, 0.09181994006195467, -0.08328363062289341)
function log_near1(x::Float64)
ix = asuint64(x)
top = top16(x)
if ix === OneAsUInt64
return zero(Float64)
end
r = x - 1.0
r2 = r * r
# r3 = r * r2
y = evalpoly(r, Ccoeffs) * r2
# Worst-case error is around 0.507 ULP
w = r * 0x1p27
rhi = (r + w) - w #
rlo = r - rhi
w = rhi * rhi * (-0.5) # B[0+1] # B[0+1] == -0.5
hi = r + w
lo = r - hi + w
lo += (-0.5) * rlo * (rhi + r) # B[0+1] * rlo * (rhi + r)
y += lo
y += hi
y
end
const Acoeffs = (-0.5000000000000001, 0.33333333331825593, -0.2499999999622955, 0.20000304511814496, -0.16667054827627667)
# Oscar Smith fixed this
function faster_log(x::Float64) # if (unlikely (top - 0x0010 >= 0x7ff0 - 0x0010))
ix = reinterpret(UInt64, x)
top = top16(x)
if ix - LO < HI - LO
return log_near1(x)
elseif (top - 0x0010) >= (0x7ff0 - 0x0010)
x < 0 && Base.Math.throw_complex_domainerror(:log, x)
iszero(x) && return -Inf
isfinite(x) || return x
# x is subnormal, normalize it.
ix = reinterpret(UInt64, x * 0x1p52)
ix -= (52 % UInt64) << 52
end
#=
x = 2^k z; where z is in range [OFF,2*OFF) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center.
=#
tmp = ix - OFF
i = Int(1 + (tmp >> 45) & 127) # lookup table index
k = reinterpret(Int64, tmp) >> 52 #(tmp % Int64) >> 52 # arithmetic shift
iz = ix - (tmp & ~Base.significand_mask(Float64))
invc, logc = getfield(LOG_TABLE, i)
z = reinterpret(Float64, iz)
# log(x) = log1p(z/c-1) + log(c) + k*Ln2.
# r ~= z/c - 1, |r| < 1/(2*N4LOG).
# rounding error: 0x1p-55/N.
r = fma(z, invc, -1.0)
kd = Float64(k)
# hi + lo = r + log(c) + k*Ln2
# w = kd * Ln2hi + logc
w = fma(kd, Ln2hi, logc)
hi = w + r
lo = w - hi + fma(kd, Ln2lo, r) # #lo = w - hi + r + kd * Ln2lo
# log(x) = lo + (log1p(r) - r) + hi
r2 = r * r # rounding error: 0x1p-54/N4LOG^2
# Worst case error if |y| > 0x1p-5:
# 0.5 + 4.13/N + abs-poly-error*2^57 ULP (+ 0.002 ULP without fma)
# Worst case error if |y| > 0x1p-4:
# 0.5 + 2.06/N + abs-poly-error*2^56 ULP (+ 0.001 ULP without fma).
# r*(r^4*A[4] + r^3*A[3] + r^2*A[2] + r*A[1] + A[0])
y = muladd(evalpoly(r, Acoeffs), r2, lo)
# y = lo + r2 * A[0+1] + r * r2 * (A[1+1] + r * A[2+1] + r2 * (A[3+1] + r * A[4+1])) + hi
return y + hi
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment