castano · June 26, 2025 14:43 · hanfling · Apr 30, 2020 · castano · Sep 14, 2020
diff --git a/perfect-quantization-dxt-endpoints.txt b/perfect-quantization-dxt-endpoints.txt
 Perfect Quantization of DXT endpoints
 -------------------------------------

 One of the issues that affect the quality of most DXT compressors is the way floating point colors are rounded.

 For example, stb_dxt does:

    max16 =  (unsigned short)(stb__sclamp((At1_r*yy - At2_r*xy)*frb+0.5f,0,31) << 11);
    max16 |= (unsigned short)(stb__sclamp((At1_g*yy - At2_g*xy)*fg +0.5f,0,63) << 5);
    max16 |= (unsigned short)(stb__sclamp((At1_b*yy - At2_b*xy)*frb+0.5f,0,31) << 0);

 And Rich's code also:

    lr = basisu::clamp((int)((xl.c[0]) * (31.0f / 255.0f) + .5f), 0, 31);
    lg = basisu::clamp((int)((xl.c[1]) * (63.0f / 255.0f) + .5f), 0, 63);
    lb = basisu::clamp((int)((xl.c[2]) * (31.0f / 255.0f) + .5f), 0, 31);

 This is not the best approach. In DXT1 the RGB565 endpoints are not quantized using uniform intervals, so simply rounding 
 them to the nearest integer in the [0-31] or [0-63] range is not accurate. A better solution is to compute the midpoints 
 of the quantization intervals and round them up or down depending on whether the value is under or over the midpoint.

 RGB565 colors are converted to 8 bits using the following bit expansion:

 R8 = (R5 << 3) | (R5 >> 2)
 G8 = (G6 << 2) | (G6 >> 4)
 B8 = (B5 << 3) | (B5 >> 2)

 And we can compute the midpoints by simply averaging every two consecutive values:

 // Fills midpoints5 and midpoints6 with the rounding thresholds between adjacent
 // RGB565 codes. Entry c is the average of the 8-bit bit-expansions of codes c
 // and c+1, divided by 255; the last entry of each table is set to 1.0f.
 void init_tables() {
    // 5-bit channels expand to 8 bits as (c << 3) | (c >> 2).
    for (int code = 0; code != 31; ++code) {
        const int next = code + 1;
        const float lo = float((code << 3) | (code >> 2)) / 255.0f;
        const float hi = float((next << 3) | (next >> 2)) / 255.0f;
        midpoints5[code] = (lo + hi) * 0.5f;
    }
    // There is no code above 31, so the top threshold is the top of the range.
    midpoints5[31] = 1.0f;

    // 6-bit channel expands to 8 bits as (c << 2) | (c >> 4).
    for (int code = 0; code != 63; ++code) {
        const int next = code + 1;
        const float lo = float((code << 2) | (code >> 4)) / 255.0f;
        const float hi = float((next << 2) | (next >> 4)) / 255.0f;
        midpoints6[code] = (lo + hi) * 0.5f;
    }
    // Same for the 6-bit table: nothing lies above code 63.
    midpoints6[63] = 1.0f;
 }

 That results in the following tables:

 // Rounding thresholds for the 5-bit channels, in [0,1]. Entry i is the midpoint
 // between the 8-bit bit-expanded values of codes i and i+1, divided by 255
 // (as computed by init_tables); entry 31 is 1.0f because no higher code exists.
 static const float midpoints5[32] = {
    0.015686f, 0.047059f, 0.078431f, 0.111765f, 0.145098f, 0.176471f, 0.207843f, 0.241176f, 
    0.274510f, 0.305882f, 0.337255f, 0.370588f, 0.403922f, 0.435294f, 0.466667f, 0.5f,
    0.533333f, 0.564706f, 0.596078f, 0.629412f, 0.662745f, 0.694118f, 0.725490f, 0.758824f, 
    0.792157f, 0.823529f, 0.854902f, 0.888235f, 0.921569f, 0.952941f, 0.984314f, 1.0f
 };

 // Rounding thresholds for the 6-bit channel, in [0,1]. Entry i is the midpoint
 // between the 8-bit bit-expanded values of codes i and i+1, divided by 255
 // (as computed by init_tables); entry 63 is 1.0f because no higher code exists.
 static const float midpoints6[64] = {
    0.007843f, 0.023529f, 0.039216f, 0.054902f, 0.070588f, 0.086275f, 0.101961f, 0.117647f, 
    0.133333f, 0.149020f, 0.164706f, 0.180392f, 0.196078f, 0.211765f, 0.227451f, 0.245098f, 
    0.262745f, 0.278431f, 0.294118f, 0.309804f, 0.325490f, 0.341176f, 0.356863f, 0.372549f, 
    0.388235f, 0.403922f, 0.419608f, 0.435294f, 0.450980f, 0.466667f, 0.482353f, 0.500000f, 
    0.517647f, 0.533333f, 0.549020f, 0.564706f, 0.580392f, 0.596078f, 0.611765f, 0.627451f, 
    0.643137f, 0.658824f, 0.674510f, 0.690196f, 0.705882f, 0.721569f, 0.737255f, 0.754902f, 
    0.772549f, 0.788235f, 0.803922f, 0.819608f, 0.835294f, 0.850980f, 0.866667f, 0.882353f, 
    0.898039f, 0.913725f, 0.929412f, 0.945098f, 0.960784f, 0.976471f, 0.992157f, 1.0f
 };

 And you can use them as follows:

 // Packs a color into RGB565, rounding each channel with the midpoint tables.
 // v is assumed to be in [0,1] range: the tables are indexed with the truncated
 // channel value, so out-of-range input is not handled here.
 static u16 vector3_to_color16(Vector3 v) {

    // Floor of each channel on its 5-, 6- and 5-bit scale.
    u16 red   = u16(v.x * 31);
    u16 green = u16(v.y * 63);
    u16 blue  = u16(v.z * 31);

    // Step up one code when the input lies above the threshold for that code.
    if (v.x > midpoints5[red])   red++;
    if (v.y > midpoints6[green]) green++;
    if (v.z > midpoints5[blue])  blue++;

    // Red lands in bits 15..11, green in bits 10..5, blue in bits 4..0.
    return (red << 11) | (green << 5) | blue;
 }

 Even though the differences are small, the correct rounding consistently produces more accurate results. Using the proposed
 rounding method in stb_dxt reduces the RMSE of my test image set as follows:

 stb              0.297230 - 0.296574 = 0.000656
 stb-hq           0.290318 - 0.289581 = 0.000737

 When using the cluster fit algorithm you can also use this method to evaluate the error of each cluster configuration. However,
 that only results in very small improvements:

 nvtt-hq          0.275365 - 0.275290 = 0.000075

 I don't know how to do the exact rounding efficiently using SIMD. In practice I don't use this approach in this case because 
 the performance hit is much higher.
	Perfect Quantization of DXT endpoints
	-------------------------------------

	One of the issues that affect the quality of most DXT compressors is the way floating point colors are rounded.

	For example, stb_dxt does:

	max16 =  (unsigned short)(stb__sclamp((At1_r*yy - At2_r*xy)*frb+0.5f,0,31) << 11);
	max16 |= (unsigned short)(stb__sclamp((At1_g*yy - At2_g*xy)*fg +0.5f,0,63) << 5);
	max16 |= (unsigned short)(stb__sclamp((At1_b*yy - At2_b*xy)*frb+0.5f,0,31) << 0);

	And Rich's code also:

	lr = basisu::clamp((int)((xl.c[0]) * (31.0f / 255.0f) + .5f), 0, 31);
	lg = basisu::clamp((int)((xl.c[1]) * (63.0f / 255.0f) + .5f), 0, 63);
	lb = basisu::clamp((int)((xl.c[2]) * (31.0f / 255.0f) + .5f), 0, 31);

	This is not the best approach. In DXT1 the RGB565 endpoints are not quantized using uniform intervals, so simply rounding
	them to the nearest integer in the [0-31] or [0-63] range is not accurate. A better solution is to compute the midpoints
	of the quantization intervals and round them up or down depending on whether the value is under or over the midpoint.

	RGB565 colors are converted to 8 bits using the following bit expansion:

	R8 = (R5 << 3) | (R5 >> 2)
	G8 = (G6 << 2) | (G6 >> 4)
	B8 = (B5 << 3) | (B5 >> 2)

	And we can compute the midpoints by simply averaging every two consecutive values:

	// Fills midpoints5 and midpoints6 with the rounding thresholds between
	// adjacent RGB565 codes. Entry i is the average of the 8-bit bit-expansions
	// of codes i and i+1, divided by 255; the last entry of each table is 1.0f.
	void init_tables() {
	    // 5-bit channels expand to 8 bits as (i << 3) | (i >> 2).
	    for (int i = 0; i < 31; i++) {
	        float f0 = float(((i+0) << 3) | ((i+0) >> 2)) / 255.0f;
	        float f1 = float(((i+1) << 3) | ((i+1) >> 2)) / 255.0f;
	        midpoints5[i] = (f0 + f1) * 0.5f;
	    }
	    // No code exists above 31, so the top threshold is the top of the range.
	    midpoints5[31] = 1.0f;

	    // 6-bit channel expands to 8 bits as (i << 2) | (i >> 4).
	    for (int i = 0; i < 63; i++) {
	        float f0 = float(((i+0) << 2) | ((i+0) >> 4)) / 255.0f;
	        float f1 = float(((i+1) << 2) | ((i+1) >> 4)) / 255.0f;
	        midpoints6[i] = (f0 + f1) * 0.5f;
	    }
	    // No code exists above 63 either.
	    midpoints6[63] = 1.0f;
	}

	That results in the following tables:

	// Rounding thresholds for the 5-bit channels, in [0,1]. Entry i is the midpoint
	// between the 8-bit bit-expanded values of codes i and i+1, divided by 255
	// (as computed by init_tables); entry 31 is 1.0f because no higher code exists.
	static const float midpoints5[32] = {
	0.015686f, 0.047059f, 0.078431f, 0.111765f, 0.145098f, 0.176471f, 0.207843f, 0.241176f,
	0.274510f, 0.305882f, 0.337255f, 0.370588f, 0.403922f, 0.435294f, 0.466667f, 0.5f,
	0.533333f, 0.564706f, 0.596078f, 0.629412f, 0.662745f, 0.694118f, 0.725490f, 0.758824f,
	0.792157f, 0.823529f, 0.854902f, 0.888235f, 0.921569f, 0.952941f, 0.984314f, 1.0f
	};

	// Rounding thresholds for the 6-bit channel, in [0,1]. Entry i is the midpoint
	// between the 8-bit bit-expanded values of codes i and i+1, divided by 255
	// (as computed by init_tables); entry 63 is 1.0f because no higher code exists.
	static const float midpoints6[64] = {
	0.007843f, 0.023529f, 0.039216f, 0.054902f, 0.070588f, 0.086275f, 0.101961f, 0.117647f,
	0.133333f, 0.149020f, 0.164706f, 0.180392f, 0.196078f, 0.211765f, 0.227451f, 0.245098f,
	0.262745f, 0.278431f, 0.294118f, 0.309804f, 0.325490f, 0.341176f, 0.356863f, 0.372549f,
	0.388235f, 0.403922f, 0.419608f, 0.435294f, 0.450980f, 0.466667f, 0.482353f, 0.500000f,
	0.517647f, 0.533333f, 0.549020f, 0.564706f, 0.580392f, 0.596078f, 0.611765f, 0.627451f,
	0.643137f, 0.658824f, 0.674510f, 0.690196f, 0.705882f, 0.721569f, 0.737255f, 0.754902f,
	0.772549f, 0.788235f, 0.803922f, 0.819608f, 0.835294f, 0.850980f, 0.866667f, 0.882353f,
	0.898039f, 0.913725f, 0.929412f, 0.945098f, 0.960784f, 0.976471f, 0.992157f, 1.0f
	};

	And you can use them as follows:

	// Packs a color into RGB565, rounding each channel with the midpoint tables.
	// v is assumed to be in [0,1] range: the tables are indexed with the truncated
	// channel value, so out-of-range input is not handled here.
	static u16 vector3_to_color16(Vector3 v) {

	    // Truncate.
	    u16 r = u16(v.x * 31);
	    u16 g = u16(v.y * 63);
	    u16 b = u16(v.z * 31);

	    // Round exactly according to 565 bit-expansion: step up one code when
	    // the input lies above the threshold stored for the truncated code.
	    r += (v.x > midpoints5[r]);
	    g += (v.y > midpoints6[g]);
	    b += (v.z > midpoints5[b]);

	    // Red lands in bits 15..11, green in bits 10..5, blue in bits 4..0.
	    return (r << 11) | (g << 5) | b;
	}

	Even though the differences are small, the correct rounding consistently produces more accurate results. Using the proposed
	rounding method in stb_dxt reduces the RMSE of my test image set as follows:

	stb 0.297230 - 0.296574 = 0.000656
	stb-hq 0.290318 - 0.289581 = 0.000737

	When using the cluster fit algorithm you can also use this method to evaluate the error of each cluster configuration. However,
	that only results in very small improvements:

	nvtt-hq 0.275365 - 0.275290 = 0.000075

	I don't know how to do the exact rounding efficiently using SIMD. In practice I don't use this approach in this case because
	the performance hit is much higher.