Skip to content

Instantly share code, notes, and snippets.

@rygorous
rygorous / nvgl_kmd_crash.cpp
Created February 11, 2023 00:22
Old bug repros from 2014.
#include <stdio.h>
#include <stdlib.h>
#include <GL/gl3w.h>
#include "util.h"
#include "radglx.h"
#define CHECK_RESULTS // turn this off and we'll keep running until the driver crashes.
#define MIN_ALIGN 1 // setting this to 128 (or higher) fixes the bug (both incorrect results and crash)
typedef unsigned char U8;
@rygorous
rygorous / gist:5c8aad95ef36c9cab3c9f2d6cfeedd8d
Created February 4, 2023 04:05
Estimate of work to finish bitstreams
mov rax, [codewords]
pext rax, [masks] ; coalesce code words
shlx rax, rax, rDestBitPos ; still need something like this
or rOut, rax ; and this
add rDestBitPos, [total] ; this and make SIMD code emit it, or load masks to a reg then do popcnt on it?
; so 5-6 insns per 4 codewords = 1.25-1.5 insns per codeword
; leaves 2.5-2.75 insns per codeword to assemble codewords SIMD
; our best bet here is, I guess, 8 16-bit codewords at once (AVX2)
;
@rygorous
rygorous / simd_multigetbits.cpp
Created February 3, 2023 08:37
Multigetbits, the second
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <smmintrin.h>
#ifdef __RADAVX__
#include <immintrin.h>
#endif
@rygorous
rygorous / simple_multigetbits.cpp
Created February 3, 2023 07:50
Multigetbits, the first
// Returns 8 bit fields at the given positions (in bits) and of the
// given widths as 16-bit integers, with the values aligned with the
// MSB at the top and garbage in the lower-order bits.
//
// Each individual len must be <= 8; the positions are bit offsets
// into the 128-bit "bytes".
template<
int pos0, int len0,
int pos1, int len1,
int pos2, int len2,
@rygorous
rygorous / results.txt
Created November 9, 2022 09:56
Oodle Texture BC1 + RDO
C:\devel\libs\icbc>\devel\projects\oodle2\cdepbuild\win64_release\textest bc1 c:\devel\media\blops3bc1\t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga out_baseline.dds
test_framework 2.9.9 built Nov 8 2022 16:18:59 on Win-x64 msvc-1929
test_bc1
reading: c:\devel\media\blops3bc1\t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga
auto alpha : RGBA with all Opaque : no alpha
CompressBCN : 64.330 millis, 858.90 c/B, rate= 4.07 MB/s
writing: out_baseline.dds
t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga BC1: rmse=3.4901 hash=0x7b37fc442d36f112
vendor rmse: 3.4953(AMD), 3.4948(NV), 3.4929(Intel), 3.5382(D3D)
rmse_total = 3.490, combined hash=0xe31c1ffb4f7950d3
@rygorous
rygorous / results.txt
Created November 9, 2022 09:44
Oodle Texture BC1 encoder at "Normal" and "High" levels vs ICBC (ICBC RMSEs when decoding for Intel)
C:\devel\libs\icbc>cl /nologo icbc_test.cpp /O2 /arch:AVX2 && icbc_test -dec intel
icbc_test.cpp
Using 32 threads.
Encoding 'c:/devel/media/blops3bc1/t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga': RMSE = 3.510 PSNR = 37.224 TIME = 0.052029 (0.052029)
Encoding 'c:/devel/media/blops3bc1/c_hro_sarah_armor_c_BC1_UNORM_sRGB.tga': RMSE = 4.406 PSNR = 35.251 TIME = 0.012941 (0.012941)
Average Results:
RMSE = 3.707 PSNR = 36.751 TIME = 0.064970 (0.064970)
C:\devel\libs\icbc>\devel\projects\oodle2\cdepbuild\win64_release\textest bc1 c:\devel\media\blops3bc1\t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga -e2
test_framework 2.9.9 built Nov 8 2022 16:18:59 on Win-x64 msvc-1929
@rygorous
rygorous / f2h.cpp
Created October 19, 2022 19:05
float<->half matching VCVTPS2PH exactly
#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>
// Float->half conversion with round-to-nearest-even, SSE2+
// leaves half-floats in 32-bit lanes (sign extended)
static inline __m128i F32_to_F16_4x(const __m128 &f)
{
const __m128 mask_sign = _mm_set1_ps(-0.0f);
const __m128i c_f16max = _mm_set1_epi32((127 + 16) << 23); // all FP32 values >=this round to +inf
// Scope guard to set up FP state as desired and reset it on exit.
// Only the declaration is visible here; the constructor and destructor
// bodies are defined elsewhere.
// NOTE(review): a destructor is declared but no copy operations are, so the
// type is implicitly copyable -- presumably instances are never copied; confirm.
struct FPStateScope
{
U32 saved_state; // presumably the FP state captured on entry so the destructor can restore it -- confirm against the definitions
FPStateScope(); // per the description above: sets up the desired FP state
~FPStateScope(); // per the description above: resets the FP state on scope exit
};
// ...
@rygorous
rygorous / transpose.cpp
Created September 6, 2022 03:15
MSB-first -> LSB-first Huff table transpose (x86/SSE2 version)
// Un-bit-reversed huff table in MSB-first decode order, used in building the real KrakenHuffTab.
// Holds two byte arrays of identical size: NEWLZ_HUFF_DECODE_TABLE_SIZE entries
// plus 16 bytes of padding each.
// NOTE(review): presumably len[i] and sym[i] describe the same table slot i --
// confirm against the code that fills this table.
struct KrakenMSBHuffTab
{
U8 len[NEWLZ_HUFF_DECODE_TABLE_SIZE + 16]; // code lens; +16 for sloppy memset (presumably fills may run up to 16 bytes past the table end -- confirm)
U8 sym[NEWLZ_HUFF_DECODE_TABLE_SIZE + 16]; // sym id; +16 for sloppy memset (same padding as len)
};
// NOTE: must match what the ASM inner loops expect (or disable them above)!
struct KrakenHuffElem
{
@rygorous
rygorous / rsqrtss_dump.cpp
Created August 11, 2022 19:43
Intel RSQRTSS logic
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <emmintrin.h>
static inline uint32_t test_func(uint32_t x)
{
__m128 xs = _mm_castsi128_ps(_mm_cvtsi32_si128(x));
__m128 value = _mm_rsqrt_ss(xs);
uint32_t result = _mm_cvtsi128_si32(_mm_castps_si128(value));