Ignacio Castaño castano

Writing to Compressed Textures

In general it's not possible to use a block-compressed texture as a render target or as a compute shader output. Instead you have to either alias the block-compressed texture with an uncompressed texture where each texel corresponds to a block, or output the compressed blocks to an uncompressed texture buffer and then copy the compressed blocks from that intermediate memory location to the final compressed texture.

Each of the graphics APIs exposes this functionality in a different way. This document explains the options available under the following APIs:

Direct3D
Vulkan
Metal
OpenGL

	// Implements "Recursive Implementation of the Gaussian Filter Using Truncated Cosine Functions" by Charalampidis [2016].
	// https://discovery.researcher.life/article/recursive-implementation-of-the-gaussian-filter-using-truncated-cosine-functions/dcf24675f5eb30dba93c5205cdae3c40
	// This code is based on:
	// https://github.com/cloudinary/ssimulacra2/blob/main/src/lib/jxl/gauss_blur.cc
	// Copyright (c) the JPEG XL Project Authors. All rights reserved.

	struct RecursiveGaussian {
	RecursiveGaussian(float sigma);

	float mul_in[3];

	// Returns p bit that results in the lowest RGB quantization error.
	inline int quantize_rgb_bc7_mode6(float r, float g, float b, float * out_qr, float * out_qg, float * out_qb) {
	float qr = 2 * trunc(r * 127.5f);
	float qg = 2 * trunc(g * 127.5f);
	float qb = 2 * trunc(b * 127.5f);

	float dr = 255 * r - qr;
	float dg = 255 * g - qg;
	float db = 255 * b - qb;

	template <typename T>
	struct Compare {
	T lambda;

	#if _MSC_VER || __APPLE__
	static int compare(void * cmp, const void * a, const void * b)
	#else
	static int compare(const void * a, const void * b, void * cmp)
	#endif
	{

	// This code is in the public domain -- Ignacio Castaño <[email protected]>

	#include "Sphere.h"
	#include "Vector.inl"
	#include "Box.inl"

	#include <float.h> // FLT_MAX

	const float radiusEpsilon = 1e-4f;

	// Emulating gathers using loads and permutevar8. This made the entire compressor about 15% faster. Both methods require AVX2.

	// Load 4 uint8 per lane.
	__m256i packedClusterIndex = _mm256_load_si256((__m256i *)&s_fourCluster[i]);

	if (count <= 8) {
	// Load r_sat in one register:
	Wide8 r07 = load8(r_sat);
	Wide8 g07 = load8(g_sat);
	Wide8 b07 = load8(b_sat);

	Wide8 besterror8 = broadcast8(FLT_MAX);
	Vector3_Wide8 beststart8;
	Vector3_Wide8 bestend8;

	// check all possible clusters for this total order
	for (int i = 0; i < s_fourClusterTotal[count - 1]; i += 8)
	{
	//uint c0 = s_fourCluster[i].c0;
	//uint c1 = s_fourCluster[i].c1;
	//uint c2 = s_fourCluster[i].c2;

	rgbcx v1.12
	icbc v1.0
	RMSE PSNR Time

	Kodak/Waterloo Image Set:

	rgbcx-0 8.128142 29.930977 0.163412
	rgbcx-1 8.041399 30.024172 0.182219
	rgbcx-2 8.198924 29.855667 0.213994
	rgbcx-3 8.005721 30.062794 0.242393

	Perfect Quantization of DXT endpoints
	-------------------------------------

	One of the issues that affect the quality of most DXT compressors is the way floating point colors are rounded.

	For example, stb_dxt does:

	max16 = (unsigned short)(stb__sclamp((At1_ryy - At2_rxy)*frb+0.5f,0,31) << 11);
	max16 |= (unsigned short)(stb__sclamp((At1_gyy - At2_gxy)*fg +0.5f,0,63) << 5);
	max16 |= (unsigned short)(stb__sclamp((At1_byy - At2_bxy)*frb+0.5f,0,31) << 0);

	RMSE PSNR Time

	Kodak/Waterloo Image Set:

	stb 8.202766 29.851597 0.258041
	stb-hq 8.009301 30.058910 0.284019
	nvtt-fast 8.089954 29.971882 0.445670

	nvtt 7.616215 30.496019 6.806233
	nvtt-hq 7.562366 30.557650 13.081200