Created
March 30, 2012 05:09
-
-
Save rygorous/2246678 to your computer and use it in GitHub Desktop.
float->sRGB8 for ISPC 1.2.0
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// float->sRGB8 conversions - two variants. | |
// by Fabian "ryg" Giesen | |
// | |
// I hereby place this code in the public domain. | |
// | |
// Both variants come with absolute error bounds and a reversibility and monotonicity | |
// guarantee. They should pass D3D10 conformance testing. | |
// | |
// This is an ISPC port of https://gist.github.com/2203834 - see there for a test | |
// driver and code that computes the tables. | |
static inline int select(int a, int b, int mask) | |
{ | |
return mask ? a : b; | |
} | |
// This is the version that tries to use a small table (4 cache lines at 64 bytes/line) | |
// at the expense of a few extra instructions. Use "var2" below for a version with | |
// less instructions that uses a somewhat larger table. | |
// | |
// Float is semi-logarithmic. | |
// Linear x->sRGB for x >= 0.0031308 is (mostly) a power function which we | |
// approximate with a bunch of linear segments based on exponent and 3 highest | |
// bits of mantissa (2 was too inaccurate). | |
// | |
// Which exponents do we care about? | |
// Exponent >= 0: value was >=1, so we return 255 (in fact, we min with 1.0f-eps, so this never happens anyway). | |
// Exponent < -9: x < 1/512 which is well into the linear part of the sRGB mapping function. | |
// So the interesting exponent range is [-9,-1]. | |
// | |
// To get a pow2-sized range, we cheat a bit and only store anchors for linear segments in | |
// the exponent range [-8,-1], using linear sRGB part of the formula until 1/256. | |
// This means that we treat a small part of the nonlinear range (namely, the interval | |
// [0.0031308,0.00390625]) as linear. Our linear scale value needs to be adjusted for this. | |
// This is done simply by starting from the "correct" scale value (255*12.92, 0x454de99a) | |
// and doing a binary search for the value that gives the best results (=lowest max error | |
// in this case) across the range we care about. | |
// | |
// The table itself has a bias in the top 16 bits and a scale factor for the linear function | |
// (based on the next 8 mantissa bits after the 3 we already used). Both are scaled to make | |
// good use of the available bits. The format was chosen this way so the linear function | |
// can be evaluated with a single PMADDWD after the mantissa bits were extracted - okay, we do | |
// need to insert one more set bit in the high half for the bias part to work. | |
// These coefficients were determined simply by doing a least-squares fit of a linear function | |
// f(x) = a+b*x inside each "bucket" (see table-making functions below). | |
// | |
// Max error for whole function (integer-rounded result minus "exact" value, as computed in | |
// floats using the official formula): 0.573277 at 0x3b7a88c6 | |
int float_to_srgb8_var1(float in) | |
{ | |
static const uniform unsigned int table[64] = { | |
0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143, | |
0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af, | |
0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240, | |
0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300, | |
0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5, 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401, | |
0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559, | |
0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723, | |
0x06970158, 0x07420142, 0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2, | |
}; | |
static const uniform unsigned int almost_one = 0x3f7fffff; | |
static const uniform unsigned int linear_scale = 0x454c5d00; | |
static const uniform unsigned int float2int_magic = 0x4b000000; | |
// Clamp to [0, 1-eps]; these two values map to 0 and 1, respectively. | |
in = max(in, 0.0f); | |
in = min(in, floatbits(almost_one)); | |
// Linear case | |
float flinear = in * linear_scale; | |
int ilinear = intbits(in + floatbits(float2int_magic)) & 0xff; | |
// Non-linear case | |
// Unpack bias, scale from table | |
unsigned int tab = table[(intbits(in) >> 20) & 63]; | |
unsigned int bias = (tab >> 16) << 9; | |
unsigned int scale = tab & 0xffff; | |
// Grab next-highest mantissa bits and perform linear interpolation | |
unsigned int t = (intbits(in) >> 12) & 0xff; | |
int itable = (bias + scale*t) >> 16; | |
return select(itable, ilinear, in < 1.0f / 256.0f); | |
} | |
// This version uses a larger table than the code above (104 entries at 4 bytes each, | |
// or 6.5 cache lines at 64b/line) but is conceptually simpler and needs less instructions. | |
// | |
// The basic ideas are still the same, only this time, we squeeze everything into the | |
// table, even the linear part of the range; since we are approximating the function as | |
// piecewise linear anyway, this is fairly easy. | |
// | |
// In the exact version of the conversion, any value that produces an output float less | |
// than 0.5 will be rounded to an integer of zero. Inverting the linear part of the transform, | |
// we get: | |
// | |
// log2(0.5 / (255 * 12.92)) =~ -12.686 | |
// | |
// which in turn means that any value smaller than about 2^(-12.687) will return 0. | |
// What this means is that we can adapt the clamping code to just clamp to | |
// [2^(-13), 1-eps] and we're covered. This means our table needs to cover a range of | |
// 13 different exponents from -13 to -1. | |
// | |
// The table lookup, storage and interpolation works exactly the same way as in the code | |
// above. | |
// | |
// Max error for the whole function (integer-rounded result minus "exact" value, as computed in | |
// floats using the official formula): 0.544403 at 0x3e9f8000 | |
int float_to_srgb8_var2(float in) | |
{ | |
static const uniform unsigned int table[104] = { | |
0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d, 0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d, | |
0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a, 0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a, | |
0x010e0033, 0x01280033, 0x01410033, 0x015b0033, 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033, | |
0x01dc0067, 0x020f0067, 0x02430067, 0x02760067, 0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067, | |
0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce, 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5, | |
0x06970158, 0x07420142, 0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2, | |
0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143, | |
0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af, | |
0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240, | |
0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300, | |
0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5, 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401, | |
0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559, | |
0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723, | |
}; | |
static const uniform unsigned int almost_one = 0x3f7fffff; | |
// Clamp to [2^(-13), 1-eps]; these two values map to 0 and 1, respectively. | |
in = max(in, 0.0f); | |
in = min(in, floatbits(almost_one)); | |
// Do the table lookup and unpack bias, scale | |
unsigned int tab = table[(intbits(in) - 0x39000000u) >> 20]; | |
unsigned int bias = (tab >> 16) << 9; | |
unsigned int scale = tab & 0xffff; | |
// Grab next-highest mantissa bits and perform linear interpolation | |
unsigned int t = (intbits(in) >> 12) & 0xff; | |
return (bias + scale*t) >> 16; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment