Created
April 11, 2017 06:57
-
-
Save petamoriken/26d8d3517a93d032d7674eaf6c720ea1 to your computer and use it in GitHub Desktop.
half precision float round function
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Scratch storage: a single 4-byte buffer exposed both as a float32 and as an
// unsigned 32-bit integer. Writing through one view and reading through the
// other reinterprets the same bits without any numeric conversion.
const buffer = new ArrayBuffer(4);
const floatView = new Float32Array(buffer);
const uint32View = new Uint32Array(buffer);
// Lookup tables for float32 -> float16 conversion. Both are indexed by the top
// nine bits of the float32 bit pattern: the 8-bit biased exponent in the low
// bits, with 0x100 set for negative numbers.
const baseTable = new Uint32Array(512);
const shiftTable = new Uint32Array(512);

for (let i = 0; i < 256; ++i) {
  const e = i - 127;

  if (e < -27) {
    // very small number (0, -0)
    baseTable[i | 0x000] = 0x0000;
    baseTable[i | 0x100] = 0x8000;
    shiftTable[i | 0x000] = 24;
    shiftTable[i | 0x100] = 24;
  } else if (e < -14) {
    // small number (denorm)
    baseTable[i | 0x000] = 0x0400 >> (-e - 14);
    baseTable[i | 0x100] = (0x0400 >> (-e - 14)) | 0x8000;
    shiftTable[i | 0x000] = -e - 1;
    shiftTable[i | 0x100] = -e - 1;
  } else if (e <= 15) {
    // normal number
    baseTable[i | 0x000] = (e + 15) << 10;
    baseTable[i | 0x100] = ((e + 15) << 10) | 0x8000;
    shiftTable[i | 0x000] = 13;
    shiftTable[i | 0x100] = 13;
  } else if (e < 128) {
    // large number (Infinity, -Infinity)
    baseTable[i | 0x000] = 0x7c00;
    baseTable[i | 0x100] = 0xfc00;
    shiftTable[i | 0x000] = 24;
    shiftTable[i | 0x100] = 24;
  } else {
    // stay (NaN, Infinity, -Infinity)
    baseTable[i | 0x000] = 0x7c00;
    baseTable[i | 0x100] = 0xfc00;
    shiftTable[i | 0x000] = 13;
    shiftTable[i | 0x100] = 13;
  }
}

/**
 * Round a number to the bit pattern of the nearest half float
 * (round to nearest, ties to even).
 *
 * The value is first narrowed to float32 through the shared scratch views and
 * then reduced to 16 bits with the lookup tables. The bits dropped by the
 * table shift are inspected so the result is rounded instead of truncated.
 *
 * @param {number} num - value to convert
 * @returns {number} 16-bit half float pattern (sign, 5-bit exponent, 10-bit mantissa)
 * @see {@link ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf}
 */
function roundToFloat16Bits(num) {
  floatView[0] = num;
  const f = uint32View[0];
  const e = (f >> 23) & 0x1ff;
  const mantissa = f & 0x007fffff;
  const shift = shiftTable[e];
  const truncated = baseTable[e] + (mantissa >> shift);

  // Below 2^-25 everything rounds to a signed zero; above 2^15 the tables
  // already yield Infinity/NaN. Neither range has bits left to round.
  const exponent = (e & 0xff) - 127;
  if (exponent < -25 || exponent > 15) {
    return truncated;
  }

  // In the denormal range the table shift also pushes out the implicit
  // leading one, so it has to take part in the rounding decision.
  const significand = exponent < -14 ? mantissa | 0x00800000 : mantissa;
  const halfway = 1 << (shift - 1);
  const discarded = significand & ((halfway << 1) - 1);

  if (discarded < halfway) {
    return truncated;
  }
  if (discarded > halfway) {
    // A carry out of the mantissa correctly bumps the exponent (up to Infinity).
    return truncated + 1;
  }

  // The float32 value sits exactly between two half floats. If narrowing the
  // input to float32 was itself inexact, the input was really on one side of
  // the midpoint; use that direction to avoid a double-rounding error.
  const exact = Math.abs(num);
  const narrowed = Math.abs(floatView[0]);
  if (narrowed > exact) {
    return truncated;
  }
  if (narrowed < exact) {
    return truncated + 1;
  }

  // Genuine tie: pick the neighbour whose lowest mantissa bit is zero.
  return truncated + (truncated & 1);
}
// Lookup tables for float16 -> float32 conversion.
const mantissaTable = new Uint32Array(2048);
const exponentTable = new Uint32Array(64);
const offsetTable = new Uint32Array(64);

// mantissa
// Entries 1..1023 serve half floats whose exponent field is zero: the 10-bit
// mantissa is renormalised into a float32 exponent/mantissa pair.
mantissaTable[0] = 0;
for (let i = 1; i < 1024; ++i) {
  let m = i << 13; // zero pad mantissa bits
  let e = 0; // zero exponent

  // normalized
  while ((m & 0x00800000) === 0) {
    e -= 0x00800000; // decrement exponent
    m <<= 1;
  }

  m &= ~0x00800000; // clear leading 1 bit
  e += 0x38800000; // adjust bias

  mantissaTable[i] = m | e;
}
// Entries 1024..2047 serve every other exponent field: the mantissa is only
// moved into float32 position and a constant 0x38000000 is added.
for (let i = 1024; i < 2048; ++i) {
  mantissaTable[i] = 0x38000000 + ((i - 1024) << 13);
}

// exponent
// Indexed by the top six bits of the half float (sign + 5-bit exponent).
exponentTable[0] = 0;
for (let i = 1; i < 31; ++i) {
  exponentTable[i] = i << 23;
}
exponentTable[31] = 0x47800000;
exponentTable[32] = 0x80000000;
for (let i = 33; i < 63; ++i) {
  exponentTable[i] = 0x80000000 + ((i - 32) << 23);
}
exponentTable[63] = 0xc7800000;

// offset
// Selects which half of mantissaTable to use: 0 when the exponent field is
// zero (indices 0 and 32), 1024 otherwise.
offsetTable[0] = 0;
for (let i = 1; i < 64; ++i) {
  if (i === 32) {
    offsetTable[i] = 0;
  } else {
    offsetTable[i] = 1024;
  }
}

/**
 * Convert half float bits to a number.
 *
 * The float32 bit pattern is assembled from the lookup tables, written into
 * the shared scratch buffer and read back through its float view.
 *
 * @param {number} h - half float number bits (0..0xffff)
 * @returns {number} the value encoded by `h`
 * @see {@link ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf}
 */
function convertNumber(h) {
  const m = h >> 10; // sign bit and exponent field
  uint32View[0] = mantissaTable[offsetTable[m] + (h & 0x3ff)] + exponentTable[m];
  return floatView[0];
}
/**
 * Returns the nearest half precision float representation of a number.
 *
 * The argument is coerced with `Number()`, encoded to half float bits with
 * `roundToFloat16Bits` and decoded again with `convertNumber`.
 *
 * @param {number} num - value to round
 * @returns {number} `num` rounded to half precision
 */
function hfround(num) {
  const value = Number(num);

  // for optimization (this if statement can be commented out)
  if (!Number.isFinite(value) || value === 0) {
    return value;
  }

  const x16 = roundToFloat16Bits(value);
  return convertNumber(x16);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Math.fround のように倍精度浮動小数点数を半精度浮動小数点数に丸める函数。
Babel Repl