This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <GL/gl3w.h> | |
#include "util.h" | |
#include "radglx.h" | |
// Configuration switches for this test program; the trailing comments on each
// define describe how they affect the bug being reproduced.
#define CHECK_RESULTS // turn this off and we'll keep running until the driver crashes. | |
#define MIN_ALIGN 1 // setting this to 128 (or higher) fixes the bug (both incorrect results and crash) | |
// Short alias for unsigned char.
typedef unsigned char U8; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; Sketch of the scalar tail of a bit-packing step: take a group of codewords,
; squeeze out the unused bits with pext, shift the result to the current
; output bit position, OR it into the output word and advance the position.
; NOTE(review): this reads as pseudo-assembly (symbolic names such as
; rDestBitPos / rOut, pext written with two operands) - not assemblable as is.
mov rax, [codewords] | |
pext rax, [masks] ; coalesce code words | |
shlx rax, rax, rDestBitPos ; still need something like this | |
or rOut, rax ; and this | |
; Open question in the note below: where [total] (the number of bits added)
; comes from - either the SIMD code stores it, or it is derived from the
; masks with a popcnt.
add rDestBitPos, [total] ; this and make SIMD code emit it, or load masks to a reg then do popcnt on it? | |
; so 5-6 insns per 4 codewords = 1.25-1.5 insns per codeword | |
; leaves 2.5-2.75 insns per codeword to assemble codewords SIMD | |
; our best bet here is, I guess, 8 16-bit codewords at once (AVX2) | |
; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdint.h> | |
#include <string.h> | |
#include <smmintrin.h> | |
#ifdef __RADAVX__ | |
#include <immintrin.h> | |
#endif |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Returns eight bit fields at the given positions (in bits) and of the | |
// given widths as 16-bit integers, with the values aligned with the | |
// MSB at the top and garbage in the lower-order bits. | |
// | |
// The individual lens must be <=8, the positions are bit offsets | |
// into the 128-bit "bytes". | |
template< | |
int pos0, int len0, | |
int pos1, int len1, | |
int pos2, int len2, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
C:\devel\libs\icbc>\devel\projects\oodle2\cdepbuild\win64_release\textest bc1 c:\devel\media\blops3bc1\t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga out_baseline.dds | |
test_framework 2.9.9 built Nov 8 2022 16:18:59 on Win-x64 msvc-1929 | |
test_bc1 | |
reading: c:\devel\media\blops3bc1\t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga | |
auto alpha : RGBA with all Opaque : no alpha | |
CompressBCN : 64.330 millis, 858.90 c/B, rate= 4.07 MB/s | |
writing: out_baseline.dds | |
t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga BC1: rmse=3.4901 hash=0x7b37fc442d36f112 | |
vendor rmse: 3.4953(AMD), 3.4948(NV), 3.4929(Intel), 3.5382(D3D) | |
rmse_total = 3.490, combined hash=0xe31c1ffb4f7950d3 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
C:\devel\libs\icbc>cl /nologo icbc_test.cpp /O2 /arch:AVX2 && icbc_test -dec intel | |
icbc_test.cpp | |
Using 32 threads. | |
Encoding 'c:/devel/media/blops3bc1/t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga': RMSE = 3.510 PSNR = 37.224 TIME = 0.052029 (0.052029) | |
Encoding 'c:/devel/media/blops3bc1/c_hro_sarah_armor_c_BC1_UNORM_sRGB.tga': RMSE = 4.406 PSNR = 35.251 TIME = 0.012941 (0.012941) | |
Average Results: | |
RMSE = 3.707 PSNR = 36.751 TIME = 0.064970 (0.064970) | |
C:\devel\libs\icbc>\devel\projects\oodle2\cdepbuild\win64_release\textest bc1 c:\devel\media\blops3bc1\t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga -e2 | |
test_framework 2.9.9 built Nov 8 2022 16:18:59 on Win-x64 msvc-1929 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdint.h> | |
#include <immintrin.h> | |
// Float->half conversion with round-to-nearest-even, SSE2+ | |
// leaves half-floats in 32-bit lanes (sign extended) | |
static inline __m128i F32_to_F16_4x(const __m128 &f) | |
{ | |
const __m128 mask_sign = _mm_set1_ps(-0.0f); | |
const __m128i c_f16max = _mm_set1_epi32((127 + 16) << 23); // all FP32 values >=this round to +inf |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Scope guard to set up FP state as desired and reset it on exit | |
// Declares one U32 member plus a default constructor and a destructor.
// Only the declarations are visible here; the definitions live elsewhere.
struct FPStateScope | |
{ | |
// NOTE(review): presumably the FP state captured by the constructor so that
// the destructor can put it back - confirm against the definitions.
U32 saved_state; | |
// Runs on scope entry (sets up the desired FP state, per the comment above the struct).
FPStateScope(); | |
// Runs on scope exit (resets the FP state, per the comment above the struct).
~FPStateScope(); | |
}; | |
// ... |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Un-bit-reversed huff table in MSB-first decode order, used in building the real KrakenHuffTab | |
// Holds two U8 arrays of identical size, NEWLZ_HUFF_DECODE_TABLE_SIZE + 16
// entries each.
// NOTE(review): presumably parallel arrays, i.e. len[i] and sym[i] describe
// the same table slot - confirm against the code that fills them.
struct KrakenMSBHuffTab | |
{ | |
// NOTE(review): the 16 extra entries look like slack so that a fill that
// overshoots the nominal table end stays inside the array - confirm the
// maximum overshoot against the memset that writes these.
U8 len[NEWLZ_HUFF_DECODE_TABLE_SIZE + 16]; // code lens; +16 for sloppy memset | |
U8 sym[NEWLZ_HUFF_DECODE_TABLE_SIZE + 16]; // sym id; +16 for sloppy memset | |
}; | |
// NOTE: must match what the ASM inner loops expect (or disable them above)! | |
struct KrakenHuffElem | |
{ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <string.h> | |
#include <stdint.h> | |
#include <emmintrin.h> | |
static inline uint32_t test_func(uint32_t x) | |
{ | |
__m128 xs = _mm_castsi128_ps(_mm_cvtsi32_si128(x)); | |
__m128 value = _mm_rsqrt_ss(xs); | |
uint32_t result = _mm_cvtsi128_si32(_mm_castps_si128(value)); |