Created
June 22, 2012 09:37
-
-
Save rjeschke/2971698 to your computer and use it in GitHub Desktop.
128 bit floats in software (Java)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* Copyright (C) 2012 René Jeschke <[email protected]> | |
* | |
* Licensed under the Apache License, Version 2.0 (the "License"); | |
* you may not use this file except in compliance with the License. | |
* You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, software | |
* distributed under the License is distributed on an "AS IS" BASIS, | |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
* See the License for the specific language governing permissions and | |
* limitations under the License. | |
*/ | |
package jfloat; | |
import java.math.BigDecimal; | |
import java.math.BigInteger; | |
/* This thing is WIP. Currently basic mul/div is working, without | |
* proper rounding, though. Everything is quick'n'dirty ... | |
* | |
* Proof-of-concept and playground. | |
* | |
* 128 Bit: (4 * UI32) | |
* 1 sign | |
* 15 bit exponent, bias: 16383 | |
* 112 mantissa | |
*/ | |
public class Float128 | |
{ | |
private final static BigDecimal BD_2 = new BigDecimal(2); | |
private final static BigDecimal MAN_DIV = new BigDecimal(new BigInteger("10000000000000000000000000000", 16)); | |
public static int getSignBit(int[] num) | |
{ | |
return num[0] & 0x80000000; | |
} | |
public static int getExponent(int[] num) | |
{ | |
return ((num[0] >> 16) & 0x7fff) - 16383; | |
} | |
private static void carryAdd4(int[] a, int carry) | |
{ | |
int c = carry; | |
if(c == 0) | |
return; | |
for(int i = 3; i >= 0; i--) | |
{ | |
long t = (a[i] & 0xffffffffL) + c; | |
a[i] = (int)t; | |
c = (int)(t >> 32); | |
if(c == 0) | |
return; | |
} | |
} | |
private static int getSetBit8(int t) | |
{ | |
if((t & 0x80) != 0) | |
return 8; | |
if((t & 0x40) != 0) | |
return 7; | |
if((t & 0x20) != 0) | |
return 6; | |
if((t & 0x10) != 0) | |
return 5; | |
if((t & 0x08) != 0) | |
return 4; | |
if((t & 0x04) != 0) | |
return 3; | |
if((t & 0x02) != 0) | |
return 2; | |
if((t & 0x01) != 0) | |
return 1; | |
return 0; | |
} | |
private static int getSetBit(int t) | |
{ | |
if(t == 0) | |
return 0; | |
if((t & 0xff000000) != 0) | |
return getSetBit8(t >> 24) + 24; | |
if((t & 0x00ff0000) != 0) | |
return getSetBit8(t >> 16) + 16; | |
if((t & 0x0000ff00) != 0) | |
return getSetBit8(t >> 8) + 8; | |
if((t & 0x000000ff) != 0) | |
return getSetBit8(t); | |
return 0; | |
} | |
public static int[] div(int[] a, int[] b, int[] r) | |
{ | |
final int sign = (a[0] ^ b[0]) & 0x80000000; | |
final int e0 = (a[0] >> 16) & 0x7fff; | |
final int e1 = (b[0] >> 16) & 0x7fff; | |
// We flush denormals automatically | |
if(e0 == 0) | |
{ | |
r[0] = r[1] = r[2] = r[3] = 0; | |
return r; | |
} | |
// For now we throw up | |
if(e1 == 0) | |
{ | |
throw new ArithmeticException("The mighty Neet did not yet define division by zero"); | |
} | |
// k is either 7 or 8 (2^k bits of precision, so 7 should be enough) | |
// N = mant(a) >> 1 | |
// D = mand(b) >> 1 | |
// e = 1 - D | |
// Q = N | |
// for(i=0, k-1) | |
// Q = Q + Q * e | |
// e = e * e | |
// end | |
int[] e = new int[4]; | |
int[] q = new int[4]; | |
// 8.120 bit fixed point | |
e[0] = ((((b[0] & 0xffff) | 0x10000) << 7) | (b[1] >>> 25)) ^ 0xffffff; | |
e[1] = ((b[1] << 7) | (b[2] >>> 25)) ^ -1; | |
e[2] = ((b[2] << 7) | (b[3] >>> 25)) ^ -1; | |
e[3] = (b[3] << 7) ^ -1; | |
carryAdd4(e, 1); | |
q[0] = (((a[0] & 0xffff) | 0x10000) << 7) | (a[1] >>> 25); | |
q[1] = (a[1] << 7) | (a[2] >>> 25); | |
q[2] = (a[2] << 7) | (a[3] >>> 25); | |
q[3] = (a[3] << 7); | |
int[] ret = new int[8]; | |
for(int n = 0; n < 7; n++) | |
{ | |
long t; | |
mul8_120(q, e, ret); | |
ret[0] = (ret[0] << 8) | (ret[1] >>> 24); | |
ret[1] = (ret[1] << 8) | (ret[2] >>> 24); | |
ret[2] = (ret[2] << 8) | (ret[3] >>> 24); | |
ret[3] = (ret[3] << 8) | (ret[4] >>> 24); | |
t = (q[3] & 0xffffffffL) + (ret[3] & 0xffffffffL); | |
q[3] = (int)t; | |
t = (q[2] & 0xffffffffL) + (ret[2] & 0xffffffffL) + (t >>> 32); | |
q[2] = (int)t; | |
t = (q[1] & 0xffffffffL) + (ret[1] & 0xffffffffL) + (t >>> 32); | |
q[1] = (int)t; | |
t = (q[0] & 0xffffffffL) + (ret[0] & 0xffffffffL) + (t >>> 32); | |
q[0] = (int)t; | |
mul8_120(e, e, ret); | |
e[0] = (ret[0] << 8) | (ret[1] >>> 24); | |
e[1] = (ret[1] << 8) | (ret[2] >>> 24); | |
e[2] = (ret[2] << 8) | (ret[3] >>> 24); | |
e[3] = (ret[3] << 8) | (ret[4] >>> 24); | |
} | |
final int sb = getSetBit(q[0]); | |
final int exp = e0 - e1 + sb + 16383 - 25; | |
if(exp <= 0) | |
{ | |
r[0] = r[1] = r[2] = r[3] = 0; | |
return r; | |
} | |
// TODO +/- inf | |
final int s0 = sb - 17; | |
final int s1 = 32 - s0; | |
r[0] = sign | (exp << 16) | ((q[0] >> s0) & 0xffff); | |
r[1] = (q[0] << s1) | (q[1] >>> s0); | |
r[2] = (q[1] << s1) | (q[2] >>> s0); | |
r[3] = (q[2] << s1) | (q[3] >>> s0); | |
return r; | |
} | |
public static int[] mul8_120(int[] a, int[] b, int[] ret) | |
{ | |
long t, carry, sum; | |
t = (a[3] & 0xffffffffL) * (b[3] & 0xffffffffL); | |
carry = t >>> 32; | |
ret[7] = (int)t; | |
t = (a[2] & 0xffffffffL) * (b[3] & 0xffffffffL); | |
sum = (t & 0xffffffffL) + carry; | |
carry = t >>> 32; | |
t = (a[3] & 0xffffffffL) * (b[2] & 0xffffffffL); | |
carry += t >>> 32; | |
sum += t & 0xffffffffL; | |
carry += sum >>> 32; | |
ret[6] = (int)sum; | |
t = (a[1] & 0xffffffffL) * (b[3] & 0xffffffffL); | |
sum = (t & 0xffffffffL) + carry; | |
carry = t >>> 32; | |
t = (a[2] & 0xffffffffL) * (b[2] & 0xffffffffL); | |
carry += t >>> 32; | |
sum += t & 0xffffffffL; | |
t = (a[3] & 0xffffffffL) * (b[1] & 0xffffffffL); | |
sum += t & 0xffffffffL; | |
carry += (t >>> 32) + (sum >>> 32); | |
ret[5] = (int)sum; | |
t = (a[0] & 0xffffffffL) * (b[3] & 0xffffffffL); | |
sum = (t & 0xffffffffL) + carry; | |
carry = t >>> 32; | |
t = (a[1] & 0xffffffffL) * (b[2] & 0xffffffffL); | |
carry += t >>> 32; | |
sum += t & 0xffffffffL; | |
t = (a[2] & 0xffffffffL) * (b[1] & 0xffffffffL); | |
carry += t >>> 32; | |
sum += t & 0xffffffffL; | |
t = (a[3] & 0xffffffffL) * (b[0] & 0xffffffffL); | |
sum += t & 0xffffffffL; | |
carry += (t >>> 32) + (sum >>> 32); | |
ret[4] = (int)sum; | |
t = (a[0] & 0xffffffffL) * (b[2] & 0xffffffffL); | |
sum = (t & 0xffffffffL) + carry; | |
carry = t >>> 32; | |
t = (a[1] & 0xffffffffL) * (b[1] & 0xffffffffL); | |
carry += t >>> 32; | |
sum += t & 0xffffffffL; | |
t = (a[2] & 0xffffffffL) * (b[0] & 0xffffffffL); | |
sum += t & 0xffffffffL; | |
carry += (t >>> 32) + (sum >>> 32); | |
ret[3] = (int)sum; | |
t = (a[0] & 0xffffffffL) * (b[1] & 0xffffffffL); | |
sum = (t & 0xffffffffL) + carry; | |
carry = t >>> 32; | |
t = (a[1] & 0xffffffffL) * (b[0] & 0xffffffffL); | |
sum += t & 0xffffffffL; | |
carry += (t >>> 32) + (sum >>> 32); | |
ret[2] = (int)sum; | |
t = (a[0] & 0xffffffffL) * (b[0] & 0xffffffffL); | |
sum = (t & 0xffffffffL) + carry; | |
ret[1] = (int)sum; | |
ret[0] = (int)((t >>> 32) + (sum >>> 32)); | |
return ret; | |
} | |
public static int[] mul(int[] a, int[] b, int[] r) | |
{ | |
final int sign = (a[0] ^ b[0]) & 0x80000000; | |
final int e0 = (a[0] >> 16) & 0x7fff; | |
final int e1 = (b[0] >> 16) & 0x7fff; | |
// We flush denormals automatically | |
if(e0 == 0 || e1 == 0) | |
{ | |
r[0] = r[1] = r[2] = r[3] = 0; | |
return r; | |
} | |
final int[] temp = mantMul(a, b, new int[8]); | |
final int s = (temp[0] >> 1); | |
final int exp = e0 + e1 - 16383 + s; | |
if(exp <= 0) | |
{ | |
r[0] = r[1] = r[2] = r[3] = 0; | |
return r; | |
} | |
// TODO +/- inf | |
final int s0 = 16 - s; | |
final int s1 = 32 - s0; | |
r[0] = sign | (exp << 16) | (((temp[0] << s0) | (temp[1] >>> s1)) & 0xffff); | |
r[1] = (temp[1] << s0) | (temp[2] >>> s1); | |
r[2] = (temp[2] << s0) | (temp[3] >>> s1); | |
r[3] = (temp[3] << s0) | (temp[4] >>> s1); | |
return r; | |
} | |
private static int[] mantMul(int[] a, int[] b, int[] ret) | |
{ | |
long t, carry, sum; | |
final int a0 = (a[0] & 0xffff) | 0x10000; | |
final int b0 = (b[0] & 0xffff) | 0x10000; | |
t = (a[3] & 0xffffffffL) * (b[3] & 0xffffffffL); | |
carry = t >>> 32; | |
ret[7] = (int)t; | |
t = (a[2] & 0xffffffffL) * (b[3] & 0xffffffffL); | |
sum = (t & 0xffffffffL) + carry; | |
carry = t >>> 32; | |
t = (a[3] & 0xffffffffL) * (b[2] & 0xffffffffL); | |
carry += t >>> 32; | |
sum += t & 0xffffffffL; | |
carry += sum >>> 32; | |
ret[6] = (int)sum; | |
t = (a[1] & 0xffffffffL) * (b[3] & 0xffffffffL); | |
sum = (t & 0xffffffffL) + carry; | |
carry = t >>> 32; | |
t = (a[2] & 0xffffffffL) * (b[2] & 0xffffffffL); | |
carry += t >>> 32; | |
sum += t & 0xffffffffL; | |
t = (a[3] & 0xffffffffL) * (b[1] & 0xffffffffL); | |
sum += t & 0xffffffffL; | |
carry += (t >>> 32) + (sum >>> 32); | |
ret[5] = (int)sum; | |
t = a0 * (b[3] & 0xffffffffL); | |
sum = (t & 0xffffffffL) + carry; | |
carry = t >>> 32; | |
t = (a[1] & 0xffffffffL) * (b[2] & 0xffffffffL); | |
carry += t >>> 32; | |
sum += t & 0xffffffffL; | |
t = (a[2] & 0xffffffffL) * (b[1] & 0xffffffffL); | |
carry += t >>> 32; | |
sum += t & 0xffffffffL; | |
t = (a[3] & 0xffffffffL) * b0; | |
sum += t & 0xffffffffL; | |
carry += (t >>> 32) + (sum >>> 32); | |
ret[4] = (int)sum; | |
t = a0 * (b[2] & 0xffffffffL); | |
sum = (t & 0xffffffffL) + carry; | |
carry = t >>> 32; | |
t = (a[1] & 0xffffffffL) * (b[1] & 0xffffffffL); | |
carry += t >>> 32; | |
sum += t & 0xffffffffL; | |
t = (a[2] & 0xffffffffL) * b0; | |
sum += t & 0xffffffffL; | |
carry += (t >>> 32) + (sum >>> 32); | |
ret[3] = (int)sum; | |
t = a0 * (b[1] & 0xffffffffL); | |
sum = (t & 0xffffffffL) + carry; | |
carry = t >>> 32; | |
t = (a[1] & 0xffffffffL) * b0; | |
sum += t & 0xffffffffL; | |
carry += (t >>> 32) + (sum >>> 32); | |
ret[2] = (int)sum; | |
t = (long)a0 * (long)b0; | |
sum = (t & 0xffffffffL) + carry; | |
ret[1] = (int)sum; | |
ret[0] = (int)((t >>> 32) + (sum >>> 32)); | |
return ret; | |
} | |
public static int[] fromDouble(double v, int[] n) | |
{ | |
long bits = Double.doubleToRawLongBits(v); | |
int sign = (int)((bits >> 63) & 1); | |
int exp = ((int)(bits >> 52) & 2047) - 1023; | |
int m0 = (int)(bits >> 36) & 0xffff; | |
int m1 = (int)(bits >> 4); | |
int m2 = (int)(bits & 15) << 28; | |
n[0] = (sign << 31) | ((exp + 16383) << 16) | m0; | |
n[1] = m1; | |
n[2] = m2; | |
n[3] = 0; | |
return n; | |
} | |
private static String mantissaToHex(int[] n) | |
{ | |
return String.format(n[0] < 0 ? "-%x%08x%08x%08x" : "%x%08x%08x%08x", (n[0] & 0xffff) | 0x10000, n[1], n[2], n[3]); | |
} | |
public static BigDecimal toBigDecimal(int[] n) | |
{ | |
final BigDecimal mant = new BigDecimal(new BigInteger(mantissaToHex(n), 16)).divide(MAN_DIV); | |
int exp = getExponent(n); | |
final BigDecimal em = exp < 0 ? BigDecimal.ONE.divide(BD_2.pow(-exp)) : BD_2.pow(exp); | |
return mant.multiply(em); | |
} | |
public static String toString(int[] n) | |
{ | |
if((n[0] & 0x7fff0000) == 0) | |
return "0"; | |
return toBigDecimal(n).toString(); | |
} | |
public static String toHexString(int[] n) | |
{ | |
StringBuilder sb = new StringBuilder(); | |
for(int i = 0; i < n.length; i++) | |
sb.append(String.format(i == 0 ? "%08x" : "%08x", n[i])); | |
return sb.toString(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment