Skip to content

Instantly share code, notes, and snippets.

@milhidaka
Created March 13, 2019 04:05
Show Gist options
  • Save milhidaka/95863906fe828198f47991c813dbe233 to your computer and use it in GitHub Desktop.
Save milhidaka/95863906fe828198f47991c813dbe233 to your computer and use it in GitHub Desktop.
float16 -> float32 conversion in C
#include <stdio.h>
#include <stdint.h>
#include <assert.h>
/* Number of values handled per run: main() reads this many uint16_t items
 * from float16.bin and writes the same number of floats. */
#define DATA_SIZE 2052
/**
 * Convert a half-precision (float16) bit pattern to a float.
 *
 * Bit layout, MSB -> LSB:
 *   float16: 1 sign bit, 5 exponent bits, 10 fraction bits
 *   float32: 1 sign bit, 8 exponent bits, 23 fraction bits
 *
 * For a normal exponent (1 to 0x1e): value = 2**(exponent-15) * (1.fraction)
 * For a denormalized exponent (0):    value = 2**-14 * (0.fraction)
 *
 * @param float16_value  Raw 16-bit encoding of the half-precision number.
 * @return The same value as a float. Zeros keep their sign, Inf stays Inf,
 *         and NaN payload bits are shifted into the top of the float32
 *         fraction field.
 *
 * Assumes `float` is a 32-bit IEEE 754 binary32 value with the same byte
 * order as uint32_t.
 */
float decode(uint16_t float16_value)
{
    uint32_t sign = (uint32_t)float16_value >> 15;
    uint32_t exponent = ((uint32_t)float16_value >> 10) & 0x1F;
    uint32_t fraction = (uint32_t)float16_value & 0x3FF;
    uint32_t float32_value;

    if (exponent == 0) {
        if (fraction == 0) {
            /* Signed zero. */
            float32_value = sign << 31;
        } else {
            /* A float16 denormal is an ordinary (normal) number in float32.
             * Shift the fraction left until its leading 1 reaches the
             * implicit-bit position (bit 10), lowering the exponent once
             * per shift, e.g. 2**-14 * 0.0101 => 2**-16 * 1.0100. */
            exponent = 127 - 14;
            while ((fraction & (UINT32_C(1) << 10)) == 0) {
                exponent--;
                fraction <<= 1;
            }
            /* Drop the now-implicit leading 1. */
            fraction &= 0x3FF;
            float32_value = (sign << 31) | (exponent << 23) | (fraction << 13);
        }
    } else if (exponent == 0x1F) {
        /* Inf or NaN: all-ones exponent in float32 as well. */
        float32_value = (sign << 31) | (UINT32_C(0xFF) << 23) | (fraction << 13);
    } else {
        /* Ordinary number: re-bias the exponent from 15 to 127. */
        float32_value = (sign << 31) | ((exponent + (127 - 15)) << 23) | (fraction << 13);
    }

    /* Reinterpret the bits through a union; dereferencing a float pointer
     * cast from a uint32_t pointer would violate the strict-aliasing rule. */
    union {
        uint32_t bits;
        float value;
    } pun;
    pun.bits = float32_value;
    return pun.value;
}
/**
 * Read DATA_SIZE float16 values from "float16.bin", decode each one to a
 * float and write the results to "float32_decoded.bin".
 *
 * @return 0 on success, 1 if a file cannot be opened, fewer than DATA_SIZE
 *         values are read, or the output cannot be completely written.
 */
int main(void)
{
    uint16_t float16_data[DATA_SIZE];
    float float32_data[DATA_SIZE];

    FILE *fr = fopen("float16.bin", "rb");
    if (fr == NULL) {
        perror("float16.bin");
        return 1;
    }
    size_t loaded = fread(float16_data, sizeof(uint16_t), DATA_SIZE, fr);
    fclose(fr);
    /* Checked explicitly rather than with assert(), which is compiled out
     * when NDEBUG is defined. */
    if (loaded != DATA_SIZE) {
        fprintf(stderr, "float16.bin: expected %zu values, read %zu\n",
                (size_t)DATA_SIZE, loaded);
        return 1;
    }

    for (size_t i = 0; i < DATA_SIZE; i++) {
        float32_data[i] = decode(float16_data[i]);
    }

    FILE *fw = fopen("float32_decoded.bin", "wb");
    if (fw == NULL) {
        perror("float32_decoded.bin");
        return 1;
    }
    size_t saved = fwrite(float32_data, sizeof(float), DATA_SIZE, fw);
    /* fclose flushes buffered output, so its result matters for writes. */
    int close_failed = (fclose(fw) != 0);
    if (saved != DATA_SIZE || close_failed) {
        fprintf(stderr, "float32_decoded.bin: write failed (%zu of %zu values)\n",
                saved, (size_t)DATA_SIZE);
        return 1;
    }

    return 0;
}
import numpy as np

# Fixed seed so the generated files are reproducible.
np.random.seed(1)

SAMPLES_PER_GROUP = 1024

# The draw order (small first, then large) determines the random stream.
data_small = np.random.normal(0.0, 1e-2, SAMPLES_PER_GROUP).astype(np.float16)
data_large = np.random.normal(0.0, 1e2, SAMPLES_PER_GROUP).astype(np.float16)

# Special values: +0, -0, +inf, -inf.
data_special = np.array([0.0, -0.0, np.inf, -np.inf], dtype=np.float16)

data = np.concatenate([data_small, data_large, data_special])

# Raw float16 values, plus numpy's own float32 conversion of the same values.
data.tofile("float16.bin")
np.asarray(data, dtype=np.float32).tofile("float32.bin")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment