Created
January 20, 2021 16:02
-
-
Save vchuravy/7f7f21dadc93fb4f0aea3b5f236e360c to your computer and use it in GitHub Desktop.
Half-precision
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This file is a part of Julia. License is MIT: https://julialang.org/license | |
// float16 intrinsics | |
// TODO: use LLVM's compiler-rt | |
#include <stdint.h> | |
#include <string.h> // memcpy | |
#include <math.h> // isnan
/*
 * Convert an IEEE 754 binary16 value, given as its raw 16-bit pattern, to the
 * float with the same value.
 *
 * Layout of `ival`: 1 sign bit, 5 exponent bits (bias 15), 10 significand bits.
 *  - zero keeps its sign;
 *  - subnormals are renormalised (every half subnormal is a normal float);
 *  - infinities map to infinities of the same sign;
 *  - NaNs keep their sign and payload (moved to the top of the float
 *    significand) and always have the quiet bit set in the result.
 */
static inline float half_to_float(uint16_t ival)
{
    const uint32_t sign_bit = (uint32_t)(ival & 0x8000) << 16; // bit 15 -> bit 31
    const uint32_t half_exp = (uint32_t)(ival & 0x7c00) >> 10;
    const uint32_t half_sig = (uint32_t)(ival & 0x03ff);
    uint32_t bits;

    if (half_exp == 0) {
        if (half_sig == 0) {
            // signed zero
            bits = sign_bit;
        }
        else {
            // Subnormal: locate the leading 1 of the significand. `lead` is
            // its 1-based distance from the top of the 10-bit field.
            int lead = 1;
            uint32_t top = 0x0200;
            while ((half_sig & top) == 0) {
                lead++;
                top >>= 1;
            }
            // The leading 1 becomes the implicit bit of the float, so it is
            // masked off and the remaining bits are left-aligned.
            const uint32_t exp32 = (uint32_t)(-14 - lead + 127) << 23;
            const uint32_t sig32 = ((half_sig & ~top) << lead) << (23 - 10);
            bits = sign_bit | exp32 | sig32;
        }
    }
    else if (half_exp == 0x1f) {
        if (half_sig == 0) {
            // +/- infinity
            bits = sign_bit | 0x7f800000;
        }
        else {
            // NaN: force the quiet bit, keep sign and payload
            bits = sign_bit | 0x7fc00000 | (half_sig << (23 - 10));
        }
    }
    else {
        // Normal number: re-bias the exponent (15 -> 127) and widen the significand.
        bits = sign_bit | ((half_exp - 15 + 127) << 23) | (half_sig << (23 - 10));
    }

    float result;
    memcpy(&result, &bits, sizeof(float));
    return result;
}
// float to half algorithm from: | |
// "Fast Half Float Conversion" by Jeroen van der Zijp | |
// ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf | |
// | |
// With adjustments for round-to-nearest, ties to even. | |
// Lookup tables for float_to_half, indexed by the top 9 bits of the float
// (sign bit followed by the 8-bit biased exponent).
//
// basetable[i]: half-precision sign and exponent bits for that float exponent
// (0 for values that become zero or subnormal, 0x7c00 for overflow to Inf).
static const uint16_t basetable[512] = {
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0400, 0x0800, 0x0c00, 0x1000, 0x1400, 0x1800, 0x1c00, 0x2000,
    0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, 0x3800, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00,
    0x5000, 0x5400, 0x5800, 0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8400, 0x8800, 0x8c00, 0x9000, 0x9400,
    0x9800, 0x9c00, 0xa000, 0xa400, 0xa800, 0xac00, 0xb000, 0xb400, 0xb800, 0xbc00, 0xc000,
    0xc400, 0xc800, 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00,
    0xf000, 0xf400, 0xf800, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00};

// shifttable[i]: right shift that aligns the 24-bit float significand
// (implicit bit included) with the 10-bit half significand field. The smallest
// entry is 0x0d (normal range); larger entries cover subnormals, and 0x18/0x19
// shift the significand out entirely.
static const uint8_t shifttable[512] = {
    0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19,
    0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19,
    0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19,
    0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19,
    0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19,
    0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19,
    0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19,
    0x19, 0x19, 0x19, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f,
    0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
    0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
    0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x0d, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19,
    0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19,
    0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19,
    0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19,
    0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19,
    0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19,
    0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19,
    0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13,
    0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
    0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
    0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d};

/*
 * Convert a float to the raw bit pattern of the nearest IEEE 754 binary16
 * value (round to nearest, ties to even).
 *
 *  - Values too large for half precision become +/-Inf.
 *  - Values too small become subnormals or signed zero.
 *  - NaNs keep their sign and the upper 10 payload bits and are always
 *    returned quiet. Forcing the quiet bit guarantees that a NaN whose payload
 *    lives only in the discarded low bits cannot collapse into an infinity.
 */
static inline uint16_t float_to_half(float param)
{
    uint32_t f;
    memcpy(&f, &param, sizeof(float));
    if (isnan(param)) {
        //                 sign                 exp      quiet    payload
        return (uint16_t)(((f >> 16) & 0x8000) | 0x7c00 | 0x0200 | ((f >> 13) & 0x03ff));
    }
    // Sign and biased exponent together select the table entry (0..511).
    const uint32_t idx = f >> 23;
    const uint8_t sh = shifttable[idx];
    f &= 0x007fffff;
    // If `param` is subnormal, the tables are set up to force the
    // result to 0, so the significand has an implicit `1` in the
    // cases we care about.
    f |= 0x00800000;
    uint16_t h = (uint16_t)(basetable[idx] + ((f >> sh) & 0x03ff));
    // Round to nearest, ties to even. `sh` is at least 0x0d, so `sh - 1` is a
    // valid shift. Results whose exponent field is already all ones (Inf) are
    // left alone.
    const uint32_t nextbit = (f >> (sh - 1)) & 1;
    if (nextbit != 0 && (h & 0x7C00) != 0x7C00) {
        // Exactly halfway: round to even. Otherwise any lower bit set means
        // we are above halfway and must round up.
        if ((h & 1) == 1 || (f & ((UINT32_C(1) << (sh - 1)) - 1)) != 0)
            h += UINT16_C(1);
    }
    return h;
}
/*
 * Externally visible half -> float conversion under the `__gnu_h2f_ieee` name.
 * `param` is the raw binary16 bit pattern; the work is done by half_to_float.
 * NOTE(review): presumably exported so compiler-generated half conversions
 * resolve to it on targets without native support; confirm against the build.
 */
float __gnu_h2f_ieee(uint16_t param)
{
    return half_to_float(param);
}
/*
 * Externally visible half -> float conversion under the `__extendhfsf2` name.
 * Behaves exactly like __gnu_h2f_ieee: `param` is the raw binary16 bit pattern
 * and the conversion is delegated to half_to_float.
 */
float __extendhfsf2(uint16_t param)
{
    return half_to_float(param);
}
/*
 * Externally visible float -> half conversion under the `__gnu_f2h_ieee` name.
 * Returns the raw binary16 bit pattern produced by float_to_half.
 */
uint16_t __gnu_f2h_ieee(float param)
{
    return float_to_half(param);
}
/*
 * Convert a double directly to the raw bit pattern of the nearest IEEE 754
 * binary16 value (round to nearest, ties to even).
 *
 * The conversion is done in a single rounding step on the double's own bits.
 * Going through an intermediate float would round twice: a double slightly
 * above a half-precision tie can first be rounded onto the tie and then be
 * rounded the wrong way by ties-to-even.
 *
 *  - Magnitudes of 2^16 or more, and values that round up past 65504,
 *    become +/-Inf.
 *  - Magnitudes below 2^-25 become signed zero.
 *  - NaNs keep their sign and upper 10 payload bits and are returned quiet.
 */
uint16_t __truncdfhf2(double param)
{
    uint64_t bits;
    memcpy(&bits, &param, sizeof bits);

    const uint16_t sign = (uint16_t)((bits >> 48) & 0x8000);
    const int biased_exp = (int)((bits >> 52) & 0x7ff);
    const uint64_t frac = bits & UINT64_C(0x000fffffffffffff);

    if (biased_exp == 0x7ff) {
        if (frac == 0)
            return (uint16_t)(sign | 0x7c00); // infinity
        // NaN: quiet bit plus the top 10 payload bits
        return (uint16_t)(sign | 0x7e00 | (uint16_t)(frac >> 42));
    }

    const int e = biased_exp - 1023; // unbiased exponent
    if (e > 15)
        return (uint16_t)(sign | 0x7c00); // too large for any finite half
    if (e < -25)
        return sign; // below half of the smallest subnormal (includes zero)

    // 53-bit significand with the implicit leading 1 made explicit.
    const uint64_t sig = frac | (UINT64_C(1) << 52);
    int shift;
    uint16_t h;
    if (e >= -14) {
        // Normal half: drop the low 42 bits, implicit bit is not stored.
        shift = 42;
        h = (uint16_t)(((e + 15) << 10) + (int)((sig >> 42) & 0x03ff));
    }
    else {
        // Subnormal half: shift further right (43..53) so the value is
        // expressed in units of 2^-24; the leading 1 becomes explicit.
        shift = 42 + (-14 - e);
        h = (uint16_t)(sig >> shift);
    }

    // Round to nearest, ties to even, using the discarded bits. A carry out of
    // the significand correctly bumps the exponent (up to Inf).
    const uint64_t halfway = UINT64_C(1) << (shift - 1);
    const uint64_t discarded = sig & ((halfway << 1) - 1);
    if (discarded > halfway || (discarded == halfway && (h & 1) != 0))
        h++;

    return (uint16_t)(sign | h);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; Module header for the reproducer: a little-endian 64-bit PowerPC Linux target.
source_filename = "g" | |
target datalayout = "e-m:e-i64:64-n32:64" | |
target triple = "powerpc64le-unknown-linux-gnu" | |
; Count-leading-zeros intrinsic on i64, used by @julia_g_219. The i1 operand
; selects whether a zero input yields poison.
declare i64 @llvm.ctlz.i64(i64, i1 immarg) #1 | |
; Builds a half-precision value from an unsigned magnitude %0, an auxiliary
; i64 %1 and a sign selector %2.
;   - If %0 > 65535 or |%2| > 1, the magnitude is +Inf (0xH7C00).
;   - Otherwise the half bit pattern is assembled with integer arithmetic in
;     %L14.
;   - In %L48 the magnitude is negated when %2 is negative.
; NOTE(review): the name suggests this is compiler output for a Julia function
; `g`; the source-level meaning of %1 is not visible here - confirm against
; the Julia code.
define half @julia_g_219(i64 zeroext %0, i64 signext %1, i32 signext %2) { | |
top: | |
; Leading-zero count of the magnitude (a zero input yields 64, not poison).
%3 = call i64 @llvm.ctlz.i64(i64 %0, i1 false) | |
; Out-of-range checks: magnitude above 65535 ...
%4 = icmp ugt i64 %0, 65535 | |
; ... or abs(%2) greater than 1.
%5 = icmp slt i32 %2, 0 | |
%6 = sub i32 0, %2 | |
%7 = select i1 %5, i32 %6, i32 %2 | |
%8 = icmp sgt i32 %7, 1 | |
%value_phi.in = or i1 %4, %8 | |
br i1 %value_phi.in, label %L48, label %L14 | |
L14: ; preds = %top | |
; %9 is the shift that brings the leading 1 of %0 to bit 11 (right shift when
; positive, left shift when negative).
%9 = sub nsw i64 52, %3 | |
%10 = icmp ult i64 %0, 2048 | |
; Right-shifted candidate, forced to 0 when the shift amount is out of range.
%11 = lshr i64 %0, %9 | |
%12 = icmp ugt i64 %9, 63 | |
%13 = select i1 %12, i64 0, i64 %11 | |
; Left-shifted candidate, guarded the same way.
%14 = sub nsw i64 0, %9 | |
%15 = shl i64 %0, %14 | |
%16 = icmp ugt i64 %14, 63 | |
%17 = select i1 %16, i64 0, i64 %15 | |
; Magnitudes below 2048 take the left-shifted value, larger ones the
; right-shifted value.
%18 = select i1 %10, i64 %17, i64 %13 | |
%19 = trunc i64 %18 to i16 | |
; Add one and drop the lowest bit: rounds the normalised value half-up.
%20 = add i16 %19, 1 | |
%21 = lshr i16 %20, 1 | |
; When %9 equals %1, bit 0 of the rounded value is cleared.
; NOTE(review): presumably the ties-to-even correction - confirm.
%22 = icmp eq i64 %9, %1 | |
%23 = zext i1 %22 to i16 | |
%24 = xor i16 %23, -1 | |
%25 = and i16 %21, %24 | |
; Combine with an exponent term derived from the leading-zero count:
; 13312 (0x3400) - (ctlz << 10), plus the rounded significand.
%26 = trunc i64 %3 to i16 | |
%27 = shl i16 %26, 10 | |
%28 = sub i16 13312, %27 | |
%29 = add i16 %28, %25 | |
%30 = bitcast i16 %29 to half | |
br label %L48 | |
L48: ; preds = %L14, %top | |
; Magnitude: the computed value from %L14, or +Inf when coming straight from %top.
%value_phi1 = phi half [ %30, %L14 ], [ 0xH7C00, %top ] | |
; Negate through float (fpext, fneg, fptrunc) and select the negated value
; only when %2 is negative.
%31 = icmp sgt i32 %2, -1 | |
%32 = fpext half %value_phi1 to float | |
%33 = fneg float %32 | |
%34 = fptrunc float %33 to half | |
%35 = select i1 %31, half %value_phi1, half %34 | |
ret half %35 | |
} | |
; Test driver: calls @julia_g_219(65520, 4, 1) and returns 1 when the result
; compares ordered-equal to +Inf (0xH7C00), 0 otherwise.
; Tracing %L14 with these arguments gives the bit pattern 0x7C00, so the
; expected return value is 1.
define i32 @main() { | |
%val = call half @julia_g_219(i64 65520, i64 4, i32 1) | |
%cmp = fcmp oeq half %val, 0xH7C00 | |
%r = zext i1 %cmp to i32 | |
ret i32 %r | |
} | |
; Attribute group #1, referenced by the @llvm.ctlz.i64 declaration.
attributes #1 = { nounwind readnone speculatable willreturn } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment