#pragma once

// uncompressed png writer & reader
// supports only 8-bit and 16-bit formats

// Performance comparison for 8192x8192 BGRA8 image (256MB)
// Compiled with "clang -O2", AVX2 requires extra "-mavx2" or "/arch:AVX2" argument
//
// For libpng (compressed) default libpng/zlib compression settings are used
// For libpng (uncompressed) the following two functions are used:
//   png_set_compression_level() with Z_NO_COMPRESSION
//   png_set_filter() with PNG_FILTER_NONE
//
// Ryzen 5950x
//   upng (AVX2)           =   22.9 msec (11157.3 MB/s), read =   20.5 msec (12499.4 MB/s)
//   upng                  =   27.7 msec ( 9254.6 MB/s), read =   20.8 msec (12296.8 MB/s)
//   libpng (uncompressed) =  169.9 msec ( 1506.9 MB/s), read =  167.5 msec ( 1528.6 MB/s)
//   libpng (compressed)   = 2148.1 msec (  119.2 MB/s), read =  503.5 msec (  508.4 MB/s)
//
// Raspberry Pi4 (-march=armv8-a+crc)
//   upng                  =  182.9 msec (1399.7 MB/s), read =  110.8 msec (2310.8 MB/s)
//   libpng (uncompressed) = 1192.7 msec ( 214.6 MB/s), read = 1211.8 msec ( 211.3 MB/s)
//   libpng (compressed)   = 9396.8 msec (  27.2 MB/s), read = 1874.6 msec ( 136.6 MB/s)
//
// Apple M1 (-march=armv8-a+crc+crypto)
//   upng                  =   22.2 msec (11523.7 MB/s), read =    8.9 msec (28622.5 MB/s)
//   libpng (uncompressed) =   93.3 msec ( 2743.3 MB/s), read =   66.6 msec ( 3841.8 MB/s)
//   libpng (compressed)   = 2038.6 msec (  125.6 MB/s), read =   90.4 msec ( 2832.5 MB/s)

#include <stddef.h>
#include <stdint.h>

typedef enum {
    UPNG_FORMAT_G8,
    UPNG_FORMAT_GA8,
    UPNG_FORMAT_RGB8,
    UPNG_FORMAT_BGR8,
    UPNG_FORMAT_RGBA8,
    UPNG_FORMAT_BGRA8,
    UPNG_FORMAT_G16,
    UPNG_FORMAT_GA16,
    UPNG_FORMAT_RGB16,
    UPNG_FORMAT_BGR16,
    UPNG_FORMAT_RGBA16,
    UPNG_FORMAT_BGRA16,
} upng_format;

typedef enum {
    UPNG_FILTER_NONE = 0,
    UPNG_FILTER_UP = 2,
} upng_filter;

// if `dst` is NULL then the function quickly returns the size needed for `dst` (`src` won't be used)
// if `pitch` is 0, then pixels in `src` are tightly packed without any padding bytes between rows
// returns 0 for unsupported parameter values
static size_t upng_write(void* dst, const void* src, uint32_t width, uint32_t height, size_t pitch, upng_format format, upng_filter filter);

// output BGR/BGRA format instead of RGB/RGBA
#define UPNG_READ_SWAP_TO_BGR 1

// if `dst` is NULL then the function quickly returns `width` / `height` / `format` values from the png header
// if `pitch` is 0, then pixels in `dst` will be tightly packed without any padding bytes between rows
// returns total size of image: `pitch` multiplied by `height`
// returns 0 if png file cannot be successfully parsed or is unsupported
// function does NOT verify CRC32 or ADLER32 checksums
static size_t upng_read(void* dst, const void* src, size_t size, uint32_t* width, uint32_t* height, upng_format* format, size_t pitch, uint32_t flags);
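
// Example usage (a minimal sketch; `width`, `height` and `pixels` are assumed to be
// provided by the caller, and error checking is omitted):
#if 0
    // writing: query the output size first, then encode into a caller-provided buffer
    size_t png_max = upng_write(NULL, NULL, width, height, 0, UPNG_FORMAT_BGRA8, UPNG_FILTER_NONE);
    void* png = malloc(png_max);
    size_t png_size = upng_write(png, pixels, width, height, 0, UPNG_FORMAT_BGRA8, UPNG_FILTER_NONE);

    // reading: query dimensions/format first, then decode into a tightly packed buffer
    uint32_t w, h;
    upng_format fmt;
    size_t pixels_size = upng_read(NULL, png, png_size, &w, &h, &fmt, 0, 0);
    void* out = malloc(pixels_size);
    upng_read(out, png, png_size, &w, &h, &fmt, 0, 0);

    free(out);
    free(png);
#endif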
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// optional defines:
// UPNG_DISABLE_AVX2   - do not use AVX2 codepath, even if AVX2 is allowed by compiler
// UPNG_USE_ARM64_PMUL - prefer the ARM64 PMUL instruction over CRC32 on non-Apple targets;
//                       this may be slower than using the CRC32 instruction, depending on the CPU
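//
// for example, to force the scalar/SSE codepaths on x64 even when building with -mavx2
// (assuming this file is saved as "upng.h"):
//
//   #define UPNG_DISABLE_AVX2
//   #include "upng.h"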
#if defined(_M_AMD64) || defined(__x86_64__)
# define UPNG__ARCH_X64
#elif defined(_M_ARM64) || defined(__aarch64__)
# define UPNG__ARCH_ARM64
#endif

#if defined(UPNG__ARCH_X64)
# if defined(__clang__) || defined(__GNUC__)
#  include <cpuid.h>
#  define UPNG__CPUID(num, regs) __cpuid(num, regs[0], regs[1], regs[2], regs[3])
#  define UPNG__CPUID2(num, sub, regs) __cpuid_count(num, sub, regs[0], regs[1], regs[2], regs[3])
#  define UPNG__TARGET(str) __attribute__((target(str)))
# else
#  include <intrin.h>
#  define UPNG__CPUID(num, regs) __cpuid(regs, num)
#  define UPNG__CPUID2(num, sub, regs) __cpuidex(regs, num, sub)
#  define UPNG__TARGET(str)
# endif
# if defined(__AVX2__) && !defined(UPNG_DISABLE_AVX2)
#  define UPNG__ARCH_X64_AVX2
#  include <immintrin.h>
#  if !defined(__clang__) && defined(_MSC_VER) && (_MSC_VER > 1930 && _MSC_VER < 1936)
    // broken MSVC versions that do not generate the VEX-encoded VPCLMULQDQ instruction
    // see https://developercommunity.visualstudio.com/t/_mm_clmulepi64_si128-intrinsic-no-longer/10277103
#   pragma message("WARNING: this MSVC compiler version produces very bad performance with AVX2 codegen!")
#   undef UPNG__ARCH_X64_AVX2
#  elif !defined(__clang__) && defined(_MSC_VER) && (_MSC_VER == 1938)
    // broken MSVC version that generates AVX512 instructions in AVX2 code
    // see https://developercommunity.visualstudio.com/t/Invalid-AVX512-instructions-generated-wh/10521872
#   pragma message("WARNING: this MSVC compiler version produces invalid instructions with AVX2 codegen!")
#   undef UPNG__ARCH_X64_AVX2
#  endif
# endif
# include <wmmintrin.h> // CLMUL  // _mm_clmulepi64_si128
# include <smmintrin.h> // SSE4.1 // _mm_extract_epi32
# include <tmmintrin.h> // SSSE3  // _mm_maddubs_epi16, _mm_hadd_epi32, _mm_shuffle_epi8
# include <emmintrin.h> // SSE2
#elif defined(UPNG__ARCH_ARM64)
# include <arm_neon.h>
# if __ARM_FEATURE_CRC32 // use -march=armv8-a+crc when possible
#  define UPNG__ARM64_CRC32 // __crc32d, __crc32b
#  include <arm_acle.h>
# endif
# if __ARM_FEATURE_CRYPTO // use -march=armv8-a+crypto when possible
#  if defined(__APPLE__) || defined(UPNG_USE_ARM64_PMUL) || !defined(UPNG__ARM64_CRC32)
#   define UPNG__ARM64_CRYPTO // vmull_p64, vmull_high_p64
#  endif
# endif
#endif

#if defined(_MSC_VER) && !defined(__clang__)
# include <intrin.h>
# define UPNG__ALIGN(n, var) __declspec(align(n)) var
# define UPNG__MSVC_BARRIER() _ReadWriteBarrier()
# define UPNG__ASSUME_ALIGNED(ptr, align) (ptr)
#else
# define UPNG__ALIGN(n, var) var __attribute__((aligned(n)))
# define UPNG__MSVC_BARRIER() // not needed for non-MSVC compilers
# define UPNG__ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned(ptr, align)
#endif

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#define UPNG__ADLER32_INIT 1U
#define UPNG__ADLER32_MOD 65521

// max number of bytes that can be processed without overflowing "b" as uint32_t
// max "b" value = 255*n*(n+1)/2 + (n+1)*(65521-1)
#define UPNG__ADLER32_CHUNK_SIZE 5552

// max number of 16-byte blocks to use for SIMD
#define UPNG__ADLER32_BLOCKS1 (UPNG__ADLER32_CHUNK_SIZE / 16)
#define UPNG__ADLER32_BLOCKS3 (UPNG__ADLER32_CHUNK_SIZE / 48)
#define UPNG__ADLER32_BLOCKS4 (UPNG__ADLER32_CHUNK_SIZE / 64)
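
// a quick way to sanity-check the chunk size above (a standalone sketch, not compiled
// as part of this header): 5552 is the largest n, stepping by 16, for which the
// worst-case "b" (a starting at 65520, every byte 255) still fits in uint32_t
#if 0
#include <assert.h>
static void upng__check_chunk_size(void)
{
    for (uint64_t n = 16; ; n += 16)
    {
        uint64_t b_max = 255 * n * (n + 1) / 2 + (n + 1) * (65521 - 1);
        if (b_max > 0xffffffffULL)
        {
            assert(n - 16 == UPNG__ADLER32_CHUNK_SIZE);
            break;
        }
    }
}
#endif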
static uint32_t upng__adler32(uint32_t adler, const void* ptr, size_t size)
{
    const uint8_t* bytes = (const uint8_t*)ptr;

    uint32_t a = adler & 0xffff;
    uint32_t b = (adler >> 16);

    // no SIMD here, this function is used either for small sizes only or when SIMD is not available
    while (size >= UPNG__ADLER32_CHUNK_SIZE)
    {
        for (size_t k = 0; k < UPNG__ADLER32_CHUNK_SIZE; k++)
        {
            a += *bytes++;
            b += a;
        }
        size -= UPNG__ADLER32_CHUNK_SIZE;
        a %= UPNG__ADLER32_MOD;
        b %= UPNG__ADLER32_MOD;
    }

    while (size-- != 0)
    {
        a += *bytes++;
        b += a;
    }
    a %= UPNG__ADLER32_MOD;
    b %= UPNG__ADLER32_MOD;

    return a | (b << 16);
}
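
// known-answer check (a standalone sketch, not compiled as part of this header;
// 0x11e60398 is the well-known Adler-32 of the string "Wikipedia"):
#if 0
#include <assert.h>
#include <string.h>
static void upng__adler32_selftest(void)
{
    const char* s = "Wikipedia";
    assert(upng__adler32(UPNG__ADLER32_INIT, s, strlen(s)) == 0x11e60398);
}
#endif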
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#define UPNG__CRC32_INIT 0U

#if !defined(UPNG__CRC32_TABLE_COUNT)
# if defined(UPNG__ARM64_CRC32)
#  define UPNG__CRC32_TABLE_COUNT 0  // no need for CRC32 tables if the ACLE crc32 instruction can be used
# elif defined(UPNG__ARCH_X64)
#  define UPNG__CRC32_TABLE_COUNT 16 // for x64 use a 16KB table (half of L1 cache)
# else
#  define UPNG__CRC32_TABLE_COUNT 8  // otherwise be safe and use only 8KB, alternatively set to 4 for a 4KB table
# endif
#endif
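
// since the value is only defaulted when not already defined, the table count can be
// overridden before including this file, e.g. to save memory on small targets:
//
//   #define UPNG__CRC32_TABLE_COUNT 4 // 4KB table, slower slicing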
#if UPNG__CRC32_TABLE_COUNT != 0
static uint32_t upng__crc32_table[UPNG__CRC32_TABLE_COUNT][256];
#endif

static void upng__crc32_init(void)
{
#if UPNG__CRC32_TABLE_COUNT != 0
    static int init = 0;
    if (!init)
    {
        const uint32_t CRC32_POLY = 0xedb88320;

        for (size_t i = 0; i < 256; i++)
        {
            uint32_t crc = (uint32_t)i;
            for (size_t j = 0; j < 8; j++)
            {
                crc = (crc >> 1) ^ (crc & 1 ? CRC32_POLY : 0);
            }
            upng__crc32_table[0][i] = crc;
        }

        for (size_t i = 1; i < UPNG__CRC32_TABLE_COUNT; i++)
        {
            for (size_t j = 0; j < 256; j++)
            {
                upng__crc32_table[i][j] = (upng__crc32_table[i - 1][j] >> 8) ^ upng__crc32_table[0][upng__crc32_table[i - 1][j] & 0xff];
            }
        }

        init = 1;
    }
#endif
}

static uint32_t upng__crc32(uint32_t crc, const void* ptr, size_t size)
{
    const uint8_t* bytes = (const uint8_t*)ptr;

    crc = ~crc;

    // no SIMD here, this function is used either for small sizes only or when SIMD is not available
#if defined(UPNG__ARM64_CRC32)
    while (size-- != 0)
    {
        crc = __crc32b(crc, *bytes++);
    }
#else
    while ((((uintptr_t)bytes % 4) != 0) && (size != 0))
    {
        crc = (crc >> 8) ^ upng__crc32_table[0][(crc & 0xff) ^ *bytes++];
        size -= 1;
    }

    // now bytes pointer is 4-byte aligned
    const uint32_t* bytes4 = (const uint32_t*)UPNG__ASSUME_ALIGNED(bytes, 4);

#if UPNG__CRC32_TABLE_COUNT == 16
    while (size >= 16)
    {
        uint32_t b0 = *bytes4++ ^ crc;
        uint32_t b1 = *bytes4++;
        uint32_t b2 = *bytes4++;
        uint32_t b3 = *bytes4++;
        size -= 16;
        // these barriers should not affect anything, but they make MSVC (2022) generate ~25% faster code
        UPNG__MSVC_BARRIER();
        crc = upng__crc32_table[0][(b3 >> 24) & 0xff] ^ upng__crc32_table[1][(b3 >> 16) & 0xff] ^ upng__crc32_table[2][(b3 >> 8) & 0xff] ^ upng__crc32_table[3][b3 & 0xff];
        UPNG__MSVC_BARRIER();
        crc ^= upng__crc32_table[4][(b2 >> 24) & 0xff] ^ upng__crc32_table[5][(b2 >> 16) & 0xff] ^ upng__crc32_table[6][(b2 >> 8) & 0xff] ^ upng__crc32_table[7][b2 & 0xff];
        UPNG__MSVC_BARRIER();
        crc ^= upng__crc32_table[8][(b1 >> 24) & 0xff] ^ upng__crc32_table[9][(b1 >> 16) & 0xff] ^ upng__crc32_table[10][(b1 >> 8) & 0xff] ^ upng__crc32_table[11][b1 & 0xff];
        UPNG__MSVC_BARRIER();
        crc ^= upng__crc32_table[12][(b0 >> 24) & 0xff] ^ upng__crc32_table[13][(b0 >> 16) & 0xff] ^ upng__crc32_table[14][(b0 >> 8) & 0xff] ^ upng__crc32_table[15][b0 & 0xff];
    }
#elif UPNG__CRC32_TABLE_COUNT == 8
    while (size >= 8)
    {
        uint32_t b0 = *bytes4++ ^ crc;
        uint32_t b1 = *bytes4++;
        size -= 8;
        size_t i0 = (b1 >> 24) & 0xff;
        size_t i1 = (b1 >> 16) & 0xff;
        size_t i2 = (b1 >> 8) & 0xff;
        size_t i3 = b1 & 0xff;
        size_t i4 = (b0 >> 24) & 0xff;
        size_t i5 = (b0 >> 16) & 0xff;
        size_t i6 = (b0 >> 8) & 0xff;
        size_t i7 = b0 & 0xff;
        // similar situation to the 16-table case: this makes MSVC (2022) generate ~25% faster code
        UPNG__MSVC_BARRIER();
        crc = upng__crc32_table[0][i0] ^ upng__crc32_table[4][i4];
        UPNG__MSVC_BARRIER();
        crc ^= upng__crc32_table[1][i1] ^ upng__crc32_table[5][i5];
        UPNG__MSVC_BARRIER();
        crc ^= upng__crc32_table[2][i2] ^ upng__crc32_table[6][i6];
        UPNG__MSVC_BARRIER();
        crc ^= upng__crc32_table[3][i3] ^ upng__crc32_table[7][i7];
    }
#elif UPNG__CRC32_TABLE_COUNT == 4
    while (size >= 4)
    {
        uint32_t b0 = *bytes4++ ^ crc;
        size -= 4;
        crc = upng__crc32_table[0][(b0 >> 24) & 0xff] ^ upng__crc32_table[1][(b0 >> 16) & 0xff] ^ upng__crc32_table[2][(b0 >> 8) & 0xff] ^ upng__crc32_table[3][b0 & 0xff];
    }
#endif

    bytes = (const uint8_t*)bytes4;
    while (size-- != 0)
    {
        crc = (crc >> 8) ^ upng__crc32_table[0][(crc & 0xff) ^ *bytes++];
    }
#endif

    return ~crc;
}
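
// known-answer check (a standalone sketch, not compiled as part of this header;
// 0xcbf43926 is the standard CRC-32 check value for the string "123456789"):
#if 0
#include <assert.h>
#include <string.h>
static void upng__crc32_selftest(void)
{
    const char* s = "123456789";
    upng__crc32_init(); // tables must be initialized before first use
    assert(upng__crc32(UPNG__CRC32_INIT, s, strlen(s)) == 0xcbf43926);
}
#endif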
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

typedef struct {
    uint32_t crc;   // crc32 for whole chunk + 4 byte type
    uint32_t adler; // adler32 for zlib payload
} upng__idat;

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#if defined(UPNG__ARCH_X64)

#define UPNG__CPUID_SSE41 (1<<1) // SSSE3+SSE4.1
#define UPNG__CPUID_CLMUL (1<<2) // SSSE3+SSE4.1+CLMUL

static int upng__cpuid(void)
{
    static int cpuid;
    if (!cpuid)
    {
        int info[4];
        UPNG__CPUID(1, info);

        int detected = (1 << 0);
        if (!!(info[2] & (1 << 9))) // SSSE3 bit in CPUID.1:ECX
        {
            if (!!(info[2] & (1 << 19))) // SSE4.1 bit in CPUID.1:ECX
            {
                detected |= UPNG__CPUID_SSE41;
                if (!!(info[2] & (1 << 1))) // CLMUL bit in CPUID.1:ECX
                {
                    detected |= UPNG__CPUID_CLMUL;
                }
            }
        }
        cpuid = detected;
    }
    return cpuid;
}
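
// a sketch of how these flags are meant to drive dispatch later in the file
// (assumed, based on the flag definitions above; not part of the public API):
#if 0
    int features = upng__cpuid();
    if (features & UPNG__CPUID_CLMUL)      { /* SSSE3+SSE4.1+PCLMULQDQ codepath */ }
    else if (features & UPNG__CPUID_SSE41) { /* SSSE3+SSE4.1 table-based codepath */ }
    else                                   { /* scalar fallback */ }
#endif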
static size_t UPNG__TARGET("ssse3,sse4.1") | |
upng__row1_sse4(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64) | |
{ | |
uint8_t* out = dst; | |
const __m128i shuffle = _mm_set_epi64x(shuffle64 + 0x0808080808080808, shuffle64); | |
uint32_t a = idat->adler & 0xffff; | |
uint32_t b = idat->adler >> 16; | |
uint32_t crc = ~idat->crc; | |
// adler32 | |
const __m128i cmul = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); | |
const __m128i ones = _mm_set1_epi16(1); | |
const __m128i zero = _mm_setzero_si128(); | |
// on input | |
// a0 = a | |
// a = a0 + x[0] | |
// b += a0 + x[0] | |
// ... | |
// a = a0 + x[0] + x[1] | |
// b += 2*a0 + 2*x[0] + 1*x[1] | |
// ... | |
// a = a0 + x[0] + ... + x[N-1] | |
// b += N*a0 + N*x[0] + (N-1)*x[1] + ... + 2*x[N-2] + 1*x[N-1] | |
// processing every 16 bytes in an iteration (5552 is multiple of 16) | |
// va = a0 + (x[0]+...+x[15]) + (x[16]+..+x[31]) + ... | |
// vb = b + (16*x[0]+...+1*x[15]) + (16*x[16]+...+1*x[31]) + ... + (16*X[N-16]+...1*x[N-1]) | |
// vs = n*a0 + (n-1)*(x[0]+...+x[15]) + (n-2)*(x[16]+...+x[31]) + ... + 1*(x[N-32]+...+x[N-17]) + 0*(x[N-16]+...+x[N-1]) | |
// where n = N/16 | |
// vs*16 | |
// N*a0 + (N-16)*x[0]+...+(N-16)*x[15] + (N-32)*x[16]+...+(N-16)*x[31] + ... + 16*x[N-32]+...+16*x[N-17] | |
// vb+vs*16 | |
// N*a0 + N*x[0] + (N-1)*x[1] + ... + 16*x[N-16] + 15*x[N-15] + ... + 1*x[N-1] | |
// for output | |
// a = va | |
// b = vb+vs*16 | |
while (size >= 16) | |
{ | |
__m128i vs = zero; | |
__m128i va = _mm_cvtsi32_si128(a); | |
__m128i vb = _mm_cvtsi32_si128(b); | |
// process as many 16-byte blocks as possible | |
size_t block_count = size / 16; | |
block_count = block_count < UPNG__ADLER32_BLOCKS1 ? block_count : UPNG__ADLER32_BLOCKS1; | |
for (size_t i = 0; i < block_count; i++) | |
{ | |
// pixel filtering | |
__m128i vlast = _mm_loadu_si128((const __m128i*)last); | |
__m128i vsrc = _mm_loadu_si128((const __m128i*)src); | |
__m128i vdst = _mm_shuffle_epi8(_mm_sub_epi8(vsrc, vlast), shuffle); | |
_mm_storeu_si128((__m128i*)dst, vdst); | |
last += inc; | |
src += 16; | |
dst += 16; | |
size -= 16; | |
// adler32 update | |
vs = _mm_add_epi32(vs, va); | |
va = _mm_add_epi32(va, _mm_sad_epu8(vdst, zero)); | |
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst, cmul), ones)); | |
// crc32 update | |
uint32_t b0 = _mm_extract_epi32(vdst, 0) ^ crc; | |
uint32_t b1 = _mm_extract_epi32(vdst, 1); | |
uint32_t b2 = _mm_extract_epi32(vdst, 2); | |
uint32_t b3 = _mm_extract_epi32(vdst, 3); | |
UPNG__MSVC_BARRIER(); | |
crc = upng__crc32_table[0][(b3 >> 24) & 0xff] ^ upng__crc32_table[1][(b3 >> 16) & 0xff] ^ upng__crc32_table[2][(b3 >> 8) & 0xff] ^ upng__crc32_table[3][b3 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[4][(b2 >> 24) & 0xff] ^ upng__crc32_table[5][(b2 >> 16) & 0xff] ^ upng__crc32_table[6][(b2 >> 8) & 0xff] ^ upng__crc32_table[7][b2 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[8][(b1 >> 24) & 0xff] ^ upng__crc32_table[9][(b1 >> 16) & 0xff] ^ upng__crc32_table[10][(b1 >> 8) & 0xff] ^ upng__crc32_table[11][b1 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[12][(b0 >> 24) & 0xff] ^ upng__crc32_table[13][(b0 >> 16) & 0xff] ^ upng__crc32_table[14][(b0 >> 8) & 0xff] ^ upng__crc32_table[15][b0 & 0xff]; | |
} | |
// vb += vs * 16 | |
vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4)); | |
// a = sum(va) | |
va = _mm_hadd_epi32(va, va); | |
va = _mm_hadd_epi32(va, va); | |
a = _mm_cvtsi128_si32(va); | |
// b = sum(vb) | |
vb = _mm_hadd_epi32(vb, vb); | |
vb = _mm_hadd_epi32(vb, vb); | |
b = _mm_cvtsi128_si32(vb); | |
a %= UPNG__ADLER32_MOD; | |
b %= UPNG__ADLER32_MOD; | |
} | |
idat->adler = a | (b << 16); | |
idat->crc = ~crc; | |
return dst - out; | |
} | |
#if defined(UPNG__ARCH_X64_AVX2)

static size_t UPNG__TARGET("ssse3,sse4.1,avx2,pclmul")
upng__row1_avx2(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64)
{
    if (size < 16)
    {
        return 0;
    }

    uint8_t* out = dst;

    const __m128i shuffle128 = _mm_set_epi64x(shuffle64 + 0x0808080808080808, shuffle64);

    uint32_t a = idat->adler & 0xffff;
    uint32_t b = idat->adler >> 16;
    uint32_t crc = ~idat->crc;

    // crc32
    const __m128i k1k2 = _mm_setr_epi32(0x54442bd4, 1, 0xc6e41596, 1);
    const __m128i k3k4 = _mm_setr_epi32(0x751997d0, 1, 0xccaa009e, 0);
    const __m128i k5k0 = _mm_setr_epi32(0x63cd6124, 1, 0x00000000, 0);
    const __m128i poly = _mm_setr_epi32(0xdb710641, 1, 0xf7011641, 0);
    const __m128i mask32 = _mm_setr_epi32(-1, 0, 0, 0); // low 32 bits

    // "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
    // https://web.archive.org/web/20230315165408/https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf

    // calculates m=(1<<n)%P, which is a 32-bit value
    // returns 33-bit value reflect(m<<32,64) << 1
    //
    // uint64_t crc32_pow2mod(size_t n)
    // {
    //     uint32_t mod = CRC32_POLY;
    //     for (size_t i = 0; i < n - 32; i++)
    //     {
    //         mod = (mod >> 1) ^ (mod & 1 ? CRC32_POLY : 0);
    //     }
    //     // bits are already reflected
    //     return (uint64_t)mod << 1;
    // }

    // calculates d=(1<<64)/P, which is a 33-bit value (65-32=33)
    // returns 33-bit value reflect(d,33)
    //
    // uint64_t crc32_2pow64div(void)
    // {
    //     uint64_t div = 1;
    //     uint32_t mod = CRC32_POLY;
    //     for (size_t i = 0; i < 32; i++)
    //     {
    //         div |= (mod&1ULL) << (i+1);
    //         mod = (mod >> 1) ^ (mod & 1 ? CRC32_POLY : 0);
    //     }
    //     // bits are already reflected
    //     return div;
    // }

    // k1 = crc32_pow2mod(4*128+32)
    // k2 = crc32_pow2mod(4*128-32)
    // k3 = crc32_pow2mod(128+32)
    // k4 = crc32_pow2mod(128-32)
    // k5 = crc32_pow2mod(64)
    // P = ((uint64_t)CRC32_POLY << 1) | 1
    // u = crc32_2pow64div()

    // first iteration does not need to multiply, just leave x0 unchanged: x0*1 => x0
    __m128i crc_mul = _mm_setr_epi32(1, 0, 0, 0);
    __m128i x0 = _mm_cvtsi32_si128(crc);

    if (size >= 64)
    {
        const __m256i shuffle256 = _mm256_broadcastsi128_si256(shuffle128);

        // adler32
        const __m256i cmul = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
        const __m256i ones = _mm256_set1_epi16(1);
        const __m256i zero = _mm256_setzero_si256();

        // crc32
        __m128i x1 = _mm_setzero_si128();
        __m128i x2 = _mm_setzero_si128();
        __m128i x3 = _mm_setzero_si128();

        while (size >= 64)
        {
            __m256i vs = zero;
            __m256i va = _mm256_zextsi128_si256(_mm_cvtsi32_si128(a));
            __m256i vb = _mm256_zextsi128_si256(_mm_cvtsi32_si128(b));

            // process as many 64-byte blocks as possible
            size_t block_count = size / 64;
            block_count = block_count < UPNG__ADLER32_BLOCKS4 ? block_count : UPNG__ADLER32_BLOCKS4;
            for (size_t i = 0; i < block_count; i++)
            {
                // pixel filtering
                __m256i vlast0 = _mm256_loadu_si256((const __m256i*)last + 0);
                __m256i vlast1 = _mm256_loadu_si256((const __m256i*)last + 1);
                __m256i vsrc0 = _mm256_loadu_si256((const __m256i*)src + 0);
                __m256i vsrc1 = _mm256_loadu_si256((const __m256i*)src + 1);
                __m256i vdst0 = _mm256_shuffle_epi8(_mm256_sub_epi8(vsrc0, vlast0), shuffle256);
                __m256i vdst1 = _mm256_shuffle_epi8(_mm256_sub_epi8(vsrc1, vlast1), shuffle256);
                _mm256_storeu_si256((__m256i*)dst + 0, vdst0);
                _mm256_storeu_si256((__m256i*)dst + 1, vdst1);
                last += inc * 4;
                src += 64;
                dst += 64;
                size -= 64;
                // adler32 update
                vs = _mm256_add_epi32(vs, va);
                va = _mm256_add_epi32(va, _mm256_sad_epu8(vdst0, zero));
                vb = _mm256_add_epi32(vb, _mm256_madd_epi16(_mm256_maddubs_epi16(vdst0, cmul), ones));
                vs = _mm256_add_epi32(vs, va);
                va = _mm256_add_epi32(va, _mm256_sad_epu8(vdst1, zero));
                vb = _mm256_add_epi32(vb, _mm256_madd_epi16(_mm256_maddubs_epi16(vdst1, cmul), ones));
                // crc32 update
                x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, crc_mul, 0x00), _mm_clmulepi64_si128(x0, crc_mul, 0x11));
                x0 = _mm_xor_si128(x0, _mm256_castsi256_si128(vdst0));
                x1 = _mm_xor_si128(_mm_clmulepi64_si128(x1, crc_mul, 0x00), _mm_clmulepi64_si128(x1, crc_mul, 0x11));
                x1 = _mm_xor_si128(x1, _mm256_extracti128_si256(vdst0, 1));
                x2 = _mm_xor_si128(_mm_clmulepi64_si128(x2, crc_mul, 0x00), _mm_clmulepi64_si128(x2, crc_mul, 0x11));
                x2 = _mm_xor_si128(x2, _mm256_castsi256_si128(vdst1));
                x3 = _mm_xor_si128(_mm_clmulepi64_si128(x3, crc_mul, 0x00), _mm_clmulepi64_si128(x3, crc_mul, 0x11));
                x3 = _mm_xor_si128(x3, _mm256_extracti128_si256(vdst1, 1));
                crc_mul = k1k2;
            }

            // vb += vs * 32 (vs is accumulated once per 32-byte lane here)
            vb = _mm256_add_epi32(vb, _mm256_slli_epi32(vs, 5));
            // a = sum(va)
            __m128i asum = _mm_add_epi32(_mm256_castsi256_si128(va), _mm256_extracti128_si256(va, 1));
            asum = _mm_hadd_epi32(asum, asum);
            asum = _mm_hadd_epi32(asum, asum);
            a = _mm_cvtsi128_si32(asum);
            // b = sum(vb)
            __m128i bsum = _mm_add_epi32(_mm256_castsi256_si128(vb), _mm256_extracti128_si256(vb, 1));
            bsum = _mm_hadd_epi32(bsum, bsum);
            bsum = _mm_hadd_epi32(bsum, bsum);
            b = _mm_cvtsi128_si32(bsum);

            a %= UPNG__ADLER32_MOD;
            b %= UPNG__ADLER32_MOD;
        }

        // reduce 512-bit to 128-bit
        x0 = _mm_xor_si128(x1, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
        x0 = _mm_xor_si128(x2, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
        x0 = _mm_xor_si128(x3, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
        crc_mul = k3k4;
    }

    if (size >= 16)
    {
        const __m128i cmul = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
        const __m128i ones = _mm_set1_epi16(1);
        const __m128i zero = _mm_setzero_si128();

        __m128i vs = zero;
        __m128i va = _mm_cvtsi32_si128(a);
        __m128i vb = _mm_cvtsi32_si128(b);

        // only 1 to 3 iterations
        while (size >= 16)
        {
            __m128i vlast = _mm_loadu_si128((const __m128i*)last);
            __m128i vsrc = _mm_loadu_si128((const __m128i*)src);
            __m128i vdst = _mm_shuffle_epi8(_mm_sub_epi8(vsrc, vlast), shuffle128);
            _mm_storeu_si128((__m128i*)dst, vdst);
            last += inc;
            src += 16;
            dst += 16;
            size -= 16;
            // adler32 update
            vs = _mm_add_epi32(vs, va);
            va = _mm_add_epi32(va, _mm_sad_epu8(vdst, zero));
            vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst, cmul), ones));
            // crc32 update
            x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, crc_mul, 0x00), _mm_clmulepi64_si128(x0, crc_mul, 0x11));
            x0 = _mm_xor_si128(x0, vdst);
            crc_mul = k3k4;
        }

        // vb += vs * 16
        vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4));
        // a = sum(va)
        va = _mm_hadd_epi32(va, va);
        va = _mm_hadd_epi32(va, va);
        a = _mm_cvtsi128_si32(va);
        // b = sum(vb)
        vb = _mm_hadd_epi32(vb, vb);
        vb = _mm_hadd_epi32(vb, vb);
        b = _mm_cvtsi128_si32(vb);

        a %= UPNG__ADLER32_MOD;
        b %= UPNG__ADLER32_MOD;
    }

    idat->adler = a | (b << 16);

    // reduce 128-bit to 96-bit
    x0 = _mm_xor_si128(_mm_srli_si128(x0, 8), _mm_clmulepi64_si128(x0, k3k4, 0x10));
    // reduce 96-bit to 64-bit
    x0 = _mm_xor_si128(_mm_srli_si128(x0, 4), _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), k5k0, 0x00));
    // reduce 64-bit to 32-bit
    __m128i x1;
    x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), poly, 0x10);
    x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32), poly, 0x00);
    crc = _mm_extract_epi32(_mm_xor_si128(x0, x1), 1);

    idat->crc = ~crc;
    return dst - out;
}
#else // UPNG__ARCH_X64_AVX2

static size_t UPNG__TARGET("ssse3,sse4.1,pclmul")
upng__row1_clmul(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64)
{
    if (size < 16)
    {
        return 0;
    }

    uint8_t* out = dst;

    const __m128i shuffle = _mm_set_epi64x(shuffle64 + 0x0808080808080808, shuffle64);

    uint32_t a = idat->adler & 0xffff;
    uint32_t b = idat->adler >> 16;
    uint32_t crc = ~idat->crc;

    // adler32
    const __m128i cmul = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    const __m128i ones = _mm_set1_epi16(1);
    const __m128i zero = _mm_setzero_si128();

    // crc32
    const __m128i k1k2 = _mm_setr_epi32(0x54442bd4, 1, 0xc6e41596, 1);
    const __m128i k3k4 = _mm_setr_epi32(0x751997d0, 1, 0xccaa009e, 0);
    const __m128i k5k0 = _mm_setr_epi32(0x63cd6124, 1, 0x00000000, 0);
    const __m128i poly = _mm_setr_epi32(0xdb710641, 1, 0xf7011641, 0);
    const __m128i mask32 = _mm_setr_epi32(-1, 0, 0, 0);

    // first iteration does not need to multiply, just leave x0 unchanged: x0*1 => x0
    __m128i crc_mul = _mm_setr_epi32(1, 0, 0, 0);
    __m128i x0 = _mm_cvtsi32_si128(crc);

    if (size >= 64)
    {
        __m128i x1 = zero;
        __m128i x2 = zero;
        __m128i x3 = zero;

        while (size >= 64)
        {
            __m128i vs = zero;
            __m128i va = _mm_cvtsi32_si128(a);
            __m128i vb = _mm_cvtsi32_si128(b);

            // process as many 64-byte blocks as possible
            size_t block_count = size / 64;
            block_count = block_count < UPNG__ADLER32_BLOCKS4 ? block_count : UPNG__ADLER32_BLOCKS4;
            for (size_t i = 0; i < block_count; i++)
            {
                // pixel filtering
                __m128i vlast0 = _mm_loadu_si128((const __m128i*)last + 0);
                __m128i vlast1 = _mm_loadu_si128((const __m128i*)last + 1);
                __m128i vlast2 = _mm_loadu_si128((const __m128i*)last + 2);
                __m128i vlast3 = _mm_loadu_si128((const __m128i*)last + 3);
                __m128i vsrc0 = _mm_loadu_si128((const __m128i*)src + 0);
                __m128i vsrc1 = _mm_loadu_si128((const __m128i*)src + 1);
                __m128i vsrc2 = _mm_loadu_si128((const __m128i*)src + 2);
                __m128i vsrc3 = _mm_loadu_si128((const __m128i*)src + 3);
                __m128i vdst0 = _mm_shuffle_epi8(_mm_sub_epi8(vsrc0, vlast0), shuffle);
                __m128i vdst1 = _mm_shuffle_epi8(_mm_sub_epi8(vsrc1, vlast1), shuffle);
                __m128i vdst2 = _mm_shuffle_epi8(_mm_sub_epi8(vsrc2, vlast2), shuffle);
                __m128i vdst3 = _mm_shuffle_epi8(_mm_sub_epi8(vsrc3, vlast3), shuffle);
                _mm_storeu_si128((__m128i*)dst + 0, vdst0);
                _mm_storeu_si128((__m128i*)dst + 1, vdst1);
                _mm_storeu_si128((__m128i*)dst + 2, vdst2);
                _mm_storeu_si128((__m128i*)dst + 3, vdst3);
                last += inc * 4;
                src += 64;
                dst += 64;
                size -= 64;
                // adler32 update
                vs = _mm_add_epi32(vs, va);
                va = _mm_add_epi32(va, _mm_sad_epu8(vdst0, zero));
                vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst0, cmul), ones));
                vs = _mm_add_epi32(vs, va);
                va = _mm_add_epi32(va, _mm_sad_epu8(vdst1, zero));
                vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst1, cmul), ones));
                vs = _mm_add_epi32(vs, va);
                va = _mm_add_epi32(va, _mm_sad_epu8(vdst2, zero));
                vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst2, cmul), ones));
                vs = _mm_add_epi32(vs, va);
                va = _mm_add_epi32(va, _mm_sad_epu8(vdst3, zero));
                vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst3, cmul), ones));
                // crc32 update
                x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, crc_mul, 0x00), _mm_clmulepi64_si128(x0, crc_mul, 0x11));
                x0 = _mm_xor_si128(x0, vdst0);
                x1 = _mm_xor_si128(_mm_clmulepi64_si128(x1, crc_mul, 0x00), _mm_clmulepi64_si128(x1, crc_mul, 0x11));
                x1 = _mm_xor_si128(x1, vdst1);
                x2 = _mm_xor_si128(_mm_clmulepi64_si128(x2, crc_mul, 0x00), _mm_clmulepi64_si128(x2, crc_mul, 0x11));
                x2 = _mm_xor_si128(x2, vdst2);
                x3 = _mm_xor_si128(_mm_clmulepi64_si128(x3, crc_mul, 0x00), _mm_clmulepi64_si128(x3, crc_mul, 0x11));
                x3 = _mm_xor_si128(x3, vdst3);
                crc_mul = k1k2;
            }

            // vb += vs * 16
            vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4));
            // a = sum(va)
            va = _mm_hadd_epi32(va, va);
            va = _mm_hadd_epi32(va, va);
            a = _mm_cvtsi128_si32(va);
            // b = sum(vb)
            vb = _mm_hadd_epi32(vb, vb);
            vb = _mm_hadd_epi32(vb, vb);
            b = _mm_cvtsi128_si32(vb);

            a %= UPNG__ADLER32_MOD;
            b %= UPNG__ADLER32_MOD;
        }

        // reduce 512-bit to 128-bit
        x0 = _mm_xor_si128(x1, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
        x0 = _mm_xor_si128(x2, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
        x0 = _mm_xor_si128(x3, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
        crc_mul = k3k4;
    }

    if (size >= 16)
    {
        __m128i vs = zero;
        __m128i va = _mm_cvtsi32_si128(a);
        __m128i vb = _mm_cvtsi32_si128(b);

        // only 1 to 3 iterations
        while (size >= 16)
        {
            __m128i vlast = _mm_loadu_si128((const __m128i*)last);
            __m128i vsrc = _mm_loadu_si128((const __m128i*)src);
            __m128i vdst = _mm_shuffle_epi8(_mm_sub_epi8(vsrc, vlast), shuffle);
            _mm_storeu_si128((__m128i*)dst, vdst);
            last += inc;
            src += 16;
            dst += 16;
            size -= 16;
            // adler32 update
            vs = _mm_add_epi32(vs, va);
            va = _mm_add_epi32(va, _mm_sad_epu8(vdst, zero));
            vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst, cmul), ones));
            // crc32 update
            x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, crc_mul, 0x00), _mm_clmulepi64_si128(x0, crc_mul, 0x11));
            x0 = _mm_xor_si128(x0, vdst);
            crc_mul = k3k4;
        }

        // vb += vs * 16
        vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4));
        // a = sum(va)
        va = _mm_hadd_epi32(va, va);
        va = _mm_hadd_epi32(va, va);
        a = _mm_cvtsi128_si32(va);
        // b = sum(vb)
        vb = _mm_hadd_epi32(vb, vb);
        vb = _mm_hadd_epi32(vb, vb);
        b = _mm_cvtsi128_si32(vb);

        a %= UPNG__ADLER32_MOD;
        b %= UPNG__ADLER32_MOD;
    }

    idat->adler = a | (b << 16);

    // reduce 128-bit to 96-bit
    x0 = _mm_xor_si128(_mm_srli_si128(x0, 8), _mm_clmulepi64_si128(x0, k3k4, 0x10));
    // reduce 96-bit to 64-bit
    x0 = _mm_xor_si128(_mm_srli_si128(x0, 4), _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), k5k0, 0x00));
    // reduce 64-bit to 32-bit
    __m128i x1;
    x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), poly, 0x10);
    x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32), poly, 0x00);
    crc = _mm_extract_epi32(_mm_xor_si128(x0, x1), 1);

    idat->crc = ~crc;
    return dst - out;
}

#endif // UPNG__ARCH_X64_AVX2
static size_t UPNG__TARGET("ssse3,sse4.1") | |
upng__row3_sse4(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, const uint8_t(*shuffle)[16]) | |
{ | |
uint8_t* out = dst; | |
const __m128i s00 = _mm_load_si128((const __m128i*)(shuffle[0])); | |
const __m128i s01 = _mm_load_si128((const __m128i*)(shuffle[1])); | |
const __m128i s10 = _mm_load_si128((const __m128i*)(shuffle[2])); | |
const __m128i s11 = _mm_load_si128((const __m128i*)(shuffle[3])); | |
const __m128i s12 = _mm_load_si128((const __m128i*)(shuffle[4])); | |
const __m128i s21 = _mm_load_si128((const __m128i*)(shuffle[5])); | |
const __m128i s22 = _mm_load_si128((const __m128i*)(shuffle[6])); | |
uint32_t a = idat->adler & 0xffff; | |
uint32_t b = idat->adler >> 16; | |
uint32_t crc = ~idat->crc; | |
// adler32 | |
const __m128i cmul = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); | |
const __m128i ones = _mm_set1_epi16(1); | |
const __m128i zero = _mm_setzero_si128(); | |
while (size >= 48) | |
{ | |
__m128i vs = zero; | |
__m128i va = _mm_cvtsi32_si128(a); | |
__m128i vb = _mm_cvtsi32_si128(b); | |
size_t block_count = size / 48; | |
block_count = block_count < UPNG__ADLER32_BLOCKS3 ? block_count : UPNG__ADLER32_BLOCKS3; | |
for (size_t i = 0; i < block_count; i++) | |
{ | |
// pixel filtering | |
__m128i vlast0 = _mm_loadu_si128((const __m128i*)last + 0); | |
__m128i vlast1 = _mm_loadu_si128((const __m128i*)last + 1); | |
__m128i vlast2 = _mm_loadu_si128((const __m128i*)last + 2); | |
__m128i vsrc0 = _mm_loadu_si128((const __m128i*)src + 0); | |
__m128i vsrc1 = _mm_loadu_si128((const __m128i*)src + 1); | |
__m128i vsrc2 = _mm_loadu_si128((const __m128i*)src + 2); | |
__m128i v0 = _mm_sub_epi8(vsrc0, vlast0); | |
__m128i v1 = _mm_sub_epi8(vsrc1, vlast1); | |
__m128i v2 = _mm_sub_epi8(vsrc2, vlast2); | |
__m128i vdst0 = _mm_or_si128(_mm_shuffle_epi8(v0, s00), _mm_shuffle_epi8(v1, s01)); | |
__m128i vdst1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(v0, s10), _mm_shuffle_epi8(v1, s11)), _mm_shuffle_epi8(v2, s12)); | |
__m128i vdst2 = _mm_or_si128(_mm_shuffle_epi8(v1, s21), _mm_shuffle_epi8(v2, s22)); | |
_mm_storeu_si128((__m128i*)dst + 0, vdst0); | |
_mm_storeu_si128((__m128i*)dst + 1, vdst1); | |
_mm_storeu_si128((__m128i*)dst + 2, vdst2); | |
last += inc; | |
src += 48; | |
dst += 48; | |
size -= 48; | |
// adler32 update | |
vs = _mm_add_epi32(vs, va); | |
va = _mm_add_epi32(va, _mm_sad_epu8(vdst0, zero)); | |
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst0, cmul), ones)); | |
vs = _mm_add_epi32(vs, va); | |
va = _mm_add_epi32(va, _mm_sad_epu8(vdst1, zero)); | |
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst1, cmul), ones)); | |
vs = _mm_add_epi32(vs, va); | |
va = _mm_add_epi32(va, _mm_sad_epu8(vdst2, zero)); | |
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst2, cmul), ones)); | |
// crc32 update | |
uint32_t b0 = _mm_extract_epi32(vdst0, 0) ^ crc; | |
uint32_t b1 = _mm_extract_epi32(vdst0, 1); | |
uint32_t b2 = _mm_extract_epi32(vdst0, 2); | |
uint32_t b3 = _mm_extract_epi32(vdst0, 3); | |
UPNG__MSVC_BARRIER(); | |
crc = upng__crc32_table[0][(b3 >> 24) & 0xff] ^ upng__crc32_table[1][(b3 >> 16) & 0xff] ^ upng__crc32_table[2][(b3 >> 8) & 0xff] ^ upng__crc32_table[3][b3 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[4][(b2 >> 24) & 0xff] ^ upng__crc32_table[5][(b2 >> 16) & 0xff] ^ upng__crc32_table[6][(b2 >> 8) & 0xff] ^ upng__crc32_table[7][b2 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[8][(b1 >> 24) & 0xff] ^ upng__crc32_table[9][(b1 >> 16) & 0xff] ^ upng__crc32_table[10][(b1 >> 8) & 0xff] ^ upng__crc32_table[11][b1 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[12][(b0 >> 24) & 0xff] ^ upng__crc32_table[13][(b0 >> 16) & 0xff] ^ upng__crc32_table[14][(b0 >> 8) & 0xff] ^ upng__crc32_table[15][b0 & 0xff]; | |
b0 = _mm_extract_epi32(vdst1, 0) ^ crc; | |
b1 = _mm_extract_epi32(vdst1, 1); | |
b2 = _mm_extract_epi32(vdst1, 2); | |
b3 = _mm_extract_epi32(vdst1, 3); | |
UPNG__MSVC_BARRIER(); | |
crc = upng__crc32_table[0][(b3 >> 24) & 0xff] ^ upng__crc32_table[1][(b3 >> 16) & 0xff] ^ upng__crc32_table[2][(b3 >> 8) & 0xff] ^ upng__crc32_table[3][b3 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[4][(b2 >> 24) & 0xff] ^ upng__crc32_table[5][(b2 >> 16) & 0xff] ^ upng__crc32_table[6][(b2 >> 8) & 0xff] ^ upng__crc32_table[7][b2 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[8][(b1 >> 24) & 0xff] ^ upng__crc32_table[9][(b1 >> 16) & 0xff] ^ upng__crc32_table[10][(b1 >> 8) & 0xff] ^ upng__crc32_table[11][b1 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[12][(b0 >> 24) & 0xff] ^ upng__crc32_table[13][(b0 >> 16) & 0xff] ^ upng__crc32_table[14][(b0 >> 8) & 0xff] ^ upng__crc32_table[15][b0 & 0xff]; | |
b0 = _mm_extract_epi32(vdst2, 0) ^ crc; | |
b1 = _mm_extract_epi32(vdst2, 1); | |
b2 = _mm_extract_epi32(vdst2, 2); | |
b3 = _mm_extract_epi32(vdst2, 3); | |
UPNG__MSVC_BARRIER(); | |
crc = upng__crc32_table[0][(b3 >> 24) & 0xff] ^ upng__crc32_table[1][(b3 >> 16) & 0xff] ^ upng__crc32_table[2][(b3 >> 8) & 0xff] ^ upng__crc32_table[3][b3 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[4][(b2 >> 24) & 0xff] ^ upng__crc32_table[5][(b2 >> 16) & 0xff] ^ upng__crc32_table[6][(b2 >> 8) & 0xff] ^ upng__crc32_table[7][b2 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[8][(b1 >> 24) & 0xff] ^ upng__crc32_table[9][(b1 >> 16) & 0xff] ^ upng__crc32_table[10][(b1 >> 8) & 0xff] ^ upng__crc32_table[11][b1 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[12][(b0 >> 24) & 0xff] ^ upng__crc32_table[13][(b0 >> 16) & 0xff] ^ upng__crc32_table[14][(b0 >> 8) & 0xff] ^ upng__crc32_table[15][b0 & 0xff]; | |
} | |
// vb += vs * 16 | |
vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4)); | |
// a = sum(va) | |
va = _mm_hadd_epi32(va, va); | |
va = _mm_hadd_epi32(va, va); | |
a = _mm_cvtsi128_si32(va); | |
// b = sum(vb) | |
vb = _mm_hadd_epi32(vb, vb); | |
vb = _mm_hadd_epi32(vb, vb); | |
b = _mm_cvtsi128_si32(vb); | |
a %= UPNG__ADLER32_MOD; | |
b %= UPNG__ADLER32_MOD; | |
} | |
idat->adler = a | (b << 16); | |
idat->crc = ~crc; | |
return dst - out; | |
} | |
static size_t UPNG__TARGET("ssse3,sse4.1,pclmul") | |
upng__row3_clmul(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, const uint8_t (*shuffle)[16]) | |
{ | |
if (size < 48) | |
{ | |
return 0; | |
} | |
uint8_t* out = dst; | |
const __m128i s00 = _mm_load_si128((const __m128i*)(shuffle[0])); | |
const __m128i s01 = _mm_load_si128((const __m128i*)(shuffle[1])); | |
const __m128i s10 = _mm_load_si128((const __m128i*)(shuffle[2])); | |
const __m128i s11 = _mm_load_si128((const __m128i*)(shuffle[3])); | |
const __m128i s12 = _mm_load_si128((const __m128i*)(shuffle[4])); | |
const __m128i s21 = _mm_load_si128((const __m128i*)(shuffle[5])); | |
const __m128i s22 = _mm_load_si128((const __m128i*)(shuffle[6])); | |
uint32_t a = idat->adler & 0xffff; | |
uint32_t b = idat->adler >> 16; | |
uint32_t crc = ~idat->crc; | |
// adler32 | |
const __m128i cmul = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); | |
const __m128i ones = _mm_set1_epi16(1); | |
const __m128i zero = _mm_setzero_si128(); | |
// crc32 | |
// k1 = crc32_pow2mod(3*128+32) | |
// k2 = crc32_pow2mod(3*128-32) | |
const __m128i k1k2 = _mm_setr_epi32(0x3db1ecdc, 0, 0x74359406, 1); | |
const __m128i k3k4 = _mm_setr_epi32(0x751997d0, 1, 0xccaa009e, 0); | |
const __m128i k5k0 = _mm_setr_epi32(0x63cd6124, 1, 0x00000000, 0); | |
const __m128i poly = _mm_setr_epi32(0xdb710641, 1, 0xf7011641, 0); | |
const __m128i mask32 = _mm_setr_epi32(-1, 0, 0, 0); | |
__m128i crc_mul = _mm_setr_epi32(1, 0, 0, 0); | |
__m128i x0 = _mm_cvtsi32_si128(crc); | |
__m128i x1 = zero; | |
__m128i x2 = zero; | |
while (size >= 48) | |
{ | |
__m128i vs = zero; | |
__m128i va = _mm_cvtsi32_si128(a); | |
__m128i vb = _mm_cvtsi32_si128(b); | |
// process as many 3x16-byte blocks as possible | |
size_t block_count = size / 48; | |
block_count = block_count < UPNG__ADLER32_BLOCKS3 ? block_count : UPNG__ADLER32_BLOCKS3; | |
for (size_t i = 0; i < block_count; i++) | |
{ | |
// pixel filtering | |
__m128i vlast0 = _mm_loadu_si128((const __m128i*)last + 0); | |
__m128i vlast1 = _mm_loadu_si128((const __m128i*)last + 1); | |
__m128i vlast2 = _mm_loadu_si128((const __m128i*)last + 2); | |
__m128i vsrc0 = _mm_loadu_si128((const __m128i*)src + 0); | |
__m128i vsrc1 = _mm_loadu_si128((const __m128i*)src + 1); | |
__m128i vsrc2 = _mm_loadu_si128((const __m128i*)src + 2); | |
__m128i v0 = _mm_sub_epi8(vsrc0, vlast0); | |
__m128i v1 = _mm_sub_epi8(vsrc1, vlast1); | |
__m128i v2 = _mm_sub_epi8(vsrc2, vlast2); | |
__m128i vdst0 = _mm_or_si128(_mm_shuffle_epi8(v0, s00), _mm_shuffle_epi8(v1, s01)); | |
__m128i vdst1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(v0, s10), _mm_shuffle_epi8(v1, s11)), _mm_shuffle_epi8(v2, s12)); | |
__m128i vdst2 = _mm_or_si128(_mm_shuffle_epi8(v1, s21), _mm_shuffle_epi8(v2, s22)); | |
_mm_storeu_si128((__m128i*)dst + 0, vdst0); | |
_mm_storeu_si128((__m128i*)dst + 1, vdst1); | |
_mm_storeu_si128((__m128i*)dst + 2, vdst2); | |
last += inc; | |
src += 48; | |
dst += 48; | |
size -= 48; | |
// adler32 update | |
vs = _mm_add_epi32(vs, va); | |
va = _mm_add_epi32(va, _mm_sad_epu8(vdst0, zero)); | |
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst0, cmul), ones)); | |
vs = _mm_add_epi32(vs, va); | |
va = _mm_add_epi32(va, _mm_sad_epu8(vdst1, zero)); | |
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst1, cmul), ones)); | |
vs = _mm_add_epi32(vs, va); | |
va = _mm_add_epi32(va, _mm_sad_epu8(vdst2, zero)); | |
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst2, cmul), ones)); | |
// crc32 update | |
x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, crc_mul, 0x00), _mm_clmulepi64_si128(x0, crc_mul, 0x11)); | |
x0 = _mm_xor_si128(x0, vdst0); | |
x1 = _mm_xor_si128(_mm_clmulepi64_si128(x1, crc_mul, 0x00), _mm_clmulepi64_si128(x1, crc_mul, 0x11)); | |
x1 = _mm_xor_si128(x1, vdst1); | |
x2 = _mm_xor_si128(_mm_clmulepi64_si128(x2, crc_mul, 0x00), _mm_clmulepi64_si128(x2, crc_mul, 0x11)); | |
x2 = _mm_xor_si128(x2, vdst2); | |
crc_mul = k1k2; | |
} | |
// vb += vs * 16 | |
vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4)); | |
// a = sum(va) | |
va = _mm_hadd_epi32(va, va); | |
va = _mm_hadd_epi32(va, va); | |
a = _mm_cvtsi128_si32(va); | |
// b = sum(vb) | |
vb = _mm_hadd_epi32(vb, vb); | |
vb = _mm_hadd_epi32(vb, vb); | |
b = _mm_cvtsi128_si32(vb); | |
a %= UPNG__ADLER32_MOD; | |
b %= UPNG__ADLER32_MOD; | |
} | |
idat->adler = a | (b << 16); | |
// reduce 384-bit to 128-bit | |
x0 = _mm_xor_si128(x1, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11))); | |
x0 = _mm_xor_si128(x2, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11))); | |
// reduce 128-bit to 96-bit | |
x0 = _mm_xor_si128(_mm_srli_si128(x0, 8), _mm_clmulepi64_si128(x0, k3k4, 0x10)); | |
// reduce 96-bit to 64-bit | |
x0 = _mm_xor_si128(_mm_srli_si128(x0, 4), _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), k5k0, 0x00)); | |
// reduce 64-bit to 32-bit | |
x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), poly, 0x10); | |
x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32), poly, 0x00); | |
crc = _mm_extract_epi32(_mm_xor_si128(x0, x1), 1); | |
idat->crc = ~crc; | |
return dst - out; | |
} | |
static size_t UPNG__TARGET("ssse3,sse4.1") | |
upng__unrow1_sse4(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64) | |
{ | |
uint8_t* out = dst; | |
const __m128i shuffle = _mm_set_epi64x(shuffle64 + 0x0808080808080808, shuffle64); | |
while (size >= 16) | |
{ | |
__m128i vlast = _mm_loadu_si128((const __m128i*)last); | |
__m128i vsrc = _mm_loadu_si128((const __m128i*)src); | |
__m128i vdst = _mm_shuffle_epi8(vsrc, shuffle); | |
_mm_storeu_si128((__m128i*)dst, _mm_add_epi8(vdst, vlast)); | |
last += inc; | |
src += 16; | |
dst += 16; | |
size -= 16; | |
} | |
return dst - out; | |
} | |
static size_t UPNG__TARGET("ssse3,sse4.1") | |
upng__unrow3_sse4(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, const uint8_t(*shuffle)[16]) | |
{ | |
uint8_t* out = dst; | |
const __m128i s00 = _mm_load_si128((const __m128i*)(shuffle[0])); | |
const __m128i s01 = _mm_load_si128((const __m128i*)(shuffle[1])); | |
const __m128i s10 = _mm_load_si128((const __m128i*)(shuffle[2])); | |
const __m128i s11 = _mm_load_si128((const __m128i*)(shuffle[3])); | |
const __m128i s12 = _mm_load_si128((const __m128i*)(shuffle[4])); | |
const __m128i s21 = _mm_load_si128((const __m128i*)(shuffle[5])); | |
const __m128i s22 = _mm_load_si128((const __m128i*)(shuffle[6])); | |
while (size >= 48) | |
{ | |
__m128i vlast0 = _mm_loadu_si128((const __m128i*)last + 0); | |
__m128i vlast1 = _mm_loadu_si128((const __m128i*)last + 1); | |
__m128i vlast2 = _mm_loadu_si128((const __m128i*)last + 2); | |
__m128i vsrc0 = _mm_loadu_si128((const __m128i*)src + 0); | |
__m128i vsrc1 = _mm_loadu_si128((const __m128i*)src + 1); | |
__m128i vsrc2 = _mm_loadu_si128((const __m128i*)src + 2); | |
__m128i vdst0 = _mm_or_si128(_mm_shuffle_epi8(vsrc0, s00), _mm_shuffle_epi8(vsrc1, s01)); | |
__m128i vdst1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(vsrc0, s10), _mm_shuffle_epi8(vsrc1, s11)), _mm_shuffle_epi8(vsrc2, s12)); | |
__m128i vdst2 = _mm_or_si128(_mm_shuffle_epi8(vsrc1, s21), _mm_shuffle_epi8(vsrc2, s22)); | |
_mm_storeu_si128((__m128i*)dst + 0, _mm_add_epi8(vdst0, vlast0)); | |
_mm_storeu_si128((__m128i*)dst + 1, _mm_add_epi8(vdst1, vlast1)); | |
_mm_storeu_si128((__m128i*)dst + 2, _mm_add_epi8(vdst2, vlast2)); | |
last += inc; | |
src += 48; | |
dst += 48; | |
size -= 48; | |
} | |
return dst - out; | |
} | |
#endif | |
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#if defined(UPNG__ARCH_ARM64)

static size_t upng__row1_arm64(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64)
{
    if (size < 16)
    {
        return 0;
    }

    uint8_t* out = dst;

    const uint64_t shuffle64_high = shuffle64 + 0x0808080808080808;
    const uint8x16_t shuffle = vreinterpretq_u8_u64(vcombine_u64(vdup_n_u64(shuffle64), vdup_n_u64(shuffle64_high)));

    uint32_t a = idat->adler & 0xffff;
    uint32_t b = idat->adler >> 16;
    uint32_t crc = ~idat->crc;

    const uint8x16_t cmul = vcombine_u8(vcreate_u8(0x090a0b0c0d0e0f10), vcreate_u8(0x0102030405060708));
    const uint32x4_t zero = vdupq_n_u32(0);

#if defined(UPNG__ARM64_CRYPTO)
    const poly64x2_t k1k2 = { 0x154442bd4, 0x1c6e41596 };
    const poly64x2_t k3k4 = { 0x1751997d0, 0x0ccaa009e };
    const poly64_t k5 = { 0x163cd6124 };
    const poly64_t poly_u = { 0x0f7011641 };
    const poly64_t poly_p = { 0x1db710641 };
    const uint64x2_t mask32 = { ~0U, 0 };

    poly64x2_t crc_mul = { 1, 0 };
    poly128_t x0 = vreinterpretq_p128_u64(vcombine_u64(vcreate_u64(crc), vcreate_u64(0)));

#define UPNG__CLADD_P128(a, b) vreinterpretq_p128_u8(veorq_u8(vreinterpretq_u8_p128(a), vreinterpretq_u8_p128(b)))
#define UPNG__CLMUL_P128(x,k,value) do { \
    poly128_t p0 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_p128(x), 0), vgetq_lane_p64(k, 0)); \
    poly128_t p1 = vmull_high_p64(vreinterpretq_p64_p128(x), k); \
    x = UPNG__CLADD_P128(UPNG__CLADD_P128(p0, p1), vreinterpretq_p128_u8(value)); \
} while (0)
#endif

    if (size >= 64)
    {
#if defined(UPNG__ARM64_CRYPTO)
        poly128_t x1 = vreinterpretq_p128_u64(vcombine_u64(vcreate_u64(0), vcreate_u64(0)));
        poly128_t x2 = x1;
        poly128_t x3 = x1;
#endif
        while (size >= 64)
        {
            uint32x4_t va = vsetq_lane_u32(a, zero, 0);
            uint32x4_t vb = vsetq_lane_u32(b, zero, 0);
            uint32x4_t vs = zero;

            // process as many 64-byte blocks as possible
            size_t block_count = size / 64;
            block_count = block_count < UPNG__ADLER32_BLOCKS4 ? block_count : UPNG__ADLER32_BLOCKS4;
            for (size_t i = 0; i < block_count; i++)
            {
                // pixel filtering
                uint8x16x4_t vlast = vld1q_u8_x4(last);
                uint8x16x4_t vsrc = vld1q_u8_x4(src);
                uint8x16_t v0 = vsubq_u8(vsrc.val[0], vlast.val[0]);
                uint8x16_t v1 = vsubq_u8(vsrc.val[1], vlast.val[1]);
                uint8x16_t v2 = vsubq_u8(vsrc.val[2], vlast.val[2]);
                uint8x16_t v3 = vsubq_u8(vsrc.val[3], vlast.val[3]);
                uint8x16_t vdst0 = vqtbl1q_u8(v0, shuffle);
                uint8x16_t vdst1 = vqtbl1q_u8(v1, shuffle);
                uint8x16_t vdst2 = vqtbl1q_u8(v2, shuffle);
                uint8x16_t vdst3 = vqtbl1q_u8(v3, shuffle);
                uint8x16x4_t vdst = { vdst0, vdst1, vdst2, vdst3 };
                vst1q_u8_x4(dst, vdst);
                last += inc * 4;
                src += 64;
                dst += 64;
                size -= 64;
                // adler32 update; these could use vdotq_u32, but it runs ~2% slower
                uint16x8_t t0, t1, t2, t3;
                vs = vaddq_u32(vs, va);
                va = vpadalq_u16(va, vpaddlq_u8(vdst0));
                vs = vaddq_u32(vs, va);
                va = vpadalq_u16(va, vpaddlq_u8(vdst1));
                vs = vaddq_u32(vs, va);
                va = vpadalq_u16(va, vpaddlq_u8(vdst2));
                vs = vaddq_u32(vs, va);
                va = vpadalq_u16(va, vpaddlq_u8(vdst3));
                t0 = vmull_u8(vget_low_u8(vdst0), vget_low_u8(cmul));
                t1 = vmull_u8(vget_low_u8(vdst1), vget_low_u8(cmul));
                t2 = vmull_u8(vget_low_u8(vdst2), vget_low_u8(cmul));
                t3 = vmull_u8(vget_low_u8(vdst3), vget_low_u8(cmul));
                t0 = vmlal_high_u8(t0, vdst0, cmul);
                t1 = vmlal_high_u8(t1, vdst1, cmul);
                t2 = vmlal_high_u8(t2, vdst2, cmul);
                t3 = vmlal_high_u8(t3, vdst3, cmul);
                vb = vpadalq_u16(vb, t0);
                vb = vpadalq_u16(vb, t1);
                vb = vpadalq_u16(vb, t2);
                vb = vpadalq_u16(vb, t3);
                // crc32 update
#if defined(UPNG__ARM64_CRYPTO)
                UPNG__CLMUL_P128(x0, crc_mul, vdst0);
                UPNG__CLMUL_P128(x1, crc_mul, vdst1);
                UPNG__CLMUL_P128(x2, crc_mul, vdst2);
                UPNG__CLMUL_P128(x3, crc_mul, vdst3);
                crc_mul = k1k2;
#elif defined(UPNG__ARM64_CRC32)
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 0));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 1));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst1), 0));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst1), 1));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst2), 0));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst2), 1));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst3), 0));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst3), 1));
#else
                uint32_t b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 0) ^ crc;
                uint32_t b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 1);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 2) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 3);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 0) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 1);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 2) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 3);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 0) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 1);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 2) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 3);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst3), 0) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst3), 1);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst3), 2) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst3), 3);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
#endif
            }

            // vb += vs * 16
            vb = vaddq_u32(vb, vshlq_n_u32(vs, 4));
            a = vaddvq_u32(va);
            b = vaddvq_u32(vb);

            a %= UPNG__ADLER32_MOD;
            b %= UPNG__ADLER32_MOD;
        }
#if defined(UPNG__ARM64_CRYPTO)
        // reduce 512-bit to 128-bit
        UPNG__CLMUL_P128(x0, k3k4, vreinterpretq_u8_p128(x1));
        UPNG__CLMUL_P128(x0, k3k4, vreinterpretq_u8_p128(x2));
        UPNG__CLMUL_P128(x0, k3k4, vreinterpretq_u8_p128(x3));
        crc_mul = k3k4;
#endif
    }

    if (size >= 16)
    {
        uint32x4_t va = vsetq_lane_u32(a, zero, 0);
        uint32x4_t vb = vsetq_lane_u32(b, zero, 0);
        uint32x4_t vs = zero;

        // only 1 to 3 iterations
        while (size >= 16)
        {
            uint8x16_t v0 = vsubq_u8(vld1q_u8(src), vld1q_u8(last));
            uint8x16_t vdst0 = vqtbl1q_u8(v0, shuffle);
            vst1q_u8(dst, vdst0);
            last += inc;
            src += 16;
            dst += 16;
            size -= 16;
            // adler32 update
            uint16x8_t t0;
            vs = vaddq_u32(vs, va);
            va = vpadalq_u16(va, vpaddlq_u8(vdst0));
            t0 = vmull_u8(vget_low_u8(vdst0), vget_low_u8(cmul));
            t0 = vmlal_high_u8(t0, vdst0, cmul);
            vb = vpadalq_u16(vb, t0);
            // crc32 update
#if defined(UPNG__ARM64_CRYPTO)
            UPNG__CLMUL_P128(x0, crc_mul, vdst0);
            crc_mul = k3k4;
#elif defined(UPNG__ARM64_CRC32)
            crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 0));
            crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 1));
#else
            uint32_t b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 0) ^ crc;
            uint32_t b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 1);
            crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
            crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
            b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 2) ^ crc;
            b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 3);
            crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
            crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
#endif
        }

        // vb += vs * 16
        vb = vaddq_u32(vb, vshlq_n_u32(vs, 4));
        a = vaddvq_u32(va);
        b = vaddvq_u32(vb);

        a %= UPNG__ADLER32_MOD;
        b %= UPNG__ADLER32_MOD;
    }

#if defined(UPNG__ARM64_CRYPTO)
    // reduce 128-bit to 96-bit
    poly128_t p0;
    p0 = vreinterpretq_p128_u8(vextq_u8(vreinterpretq_u8_p128(x0), vdupq_n_u8(0), 8));
    x0 = UPNG__CLADD_P128(p0, vmull_p64(vgetq_lane_p64(vreinterpretq_p64_p128(x0), 0), vgetq_lane_p64(k3k4, 1)));
    // reduce 96-bit to 64-bit
    p0 = vreinterpretq_p128_u8(vextq_u8(vreinterpretq_u8_p128(x0), vdupq_n_u8(0), 4));
    x0 = UPNG__CLADD_P128(p0, vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x0), mask32)), 0), k5));
    // reduce 64-bit to 32-bit
    poly128_t x1;
    x1 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x0), mask32)), 0), poly_u);
    x1 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x1), mask32)), 0), poly_p);
    crc = vgetq_lane_u32(vreinterpretq_u32_p128(UPNG__CLADD_P128(x0, x1)), 1);
#undef UPNG__CLADD_P128
#undef UPNG__CLMUL_P128
#endif

    idat->adler = a | (b << 16);
    idat->crc = ~crc;
    return dst - out;
}
static size_t upng__row3_arm64(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, const uint8_t(*shuffle)[16]) | |
{ | |
if (size < 48) | |
{ | |
return 0; | |
} | |
uint8_t* out = dst; | |
const uint8x16_t s00 = vld1q_u8(shuffle[0]); | |
const uint8x16_t s01 = vld1q_u8(shuffle[1]); | |
const uint8x16_t s10 = vld1q_u8(shuffle[2]); | |
const uint8x16_t s11 = vld1q_u8(shuffle[3]); | |
const uint8x16_t s12 = vld1q_u8(shuffle[4]); | |
const uint8x16_t s21 = vld1q_u8(shuffle[5]); | |
const uint8x16_t s22 = vld1q_u8(shuffle[6]); | |
uint32_t a = idat->adler & 0xffff; | |
uint32_t b = idat->adler >> 16; | |
uint32_t crc = ~idat->crc; | |
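// the two little-endian u64 constants below place weights 16,15,...,1 into byte lanes 0..15, | |
// each byte's multiplier for the adler32 "b" sum within a 16-byte chunk | |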
const uint8x16_t cmul = vcombine_u8(vcreate_u8(0x090a0b0c0d0e0f10), vcreate_u8(0x0102030405060708)); | |
const uint32x4_t zero = vdupq_n_u32(0); | |
#if defined(UPNG__ARM64_CRYPTO) | |
const poly64x2_t k1k2 = { 0x03db1ecdc, 0x174359406 }; | |
const poly64x2_t k3k4 = { 0x1751997d0, 0x0ccaa009e }; | |
const poly64_t k5 = { 0x163cd6124 }; | |
const poly64_t poly_u = { 0x0f7011641 }; | |
const poly64_t poly_p = { 0x1db710641 }; | |
const uint64x2_t mask32 = { ~0U, 0 }; | |
poly64x2_t crc_mul = { 1, 0 }; | |
poly128_t x0 = vreinterpretq_p128_u64(vcombine_u64(vcreate_u64(crc), vcreate_u64(0))); | |
poly128_t x1 = vreinterpretq_p128_u64(vcombine_u64(vcreate_u64(0), vcreate_u64(0))); | |
poly128_t x2 = x1; | |
#define UPNG__CLADD_P128(a, b) vreinterpretq_p128_u8(veorq_u8(vreinterpretq_u8_p128(a), vreinterpretq_u8_p128(b))) | |
#define UPNG__CLMUL_P128(x,k,value) do { \ | |
poly128_t p0 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_p128(x), 0), vgetq_lane_p64(k, 0)); \ | |
poly128_t p1 = vmull_high_p64(vreinterpretq_p64_p128(x), k); \ | |
x = UPNG__CLADD_P128(UPNG__CLADD_P128(p0, p1), vreinterpretq_p128_u8(value)); \ | |
} while (0) | |
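// one CRC32 folding step via carry-less multiplication: the 128-bit accumulator "x" is | |
// multiplied by a pair of precomputed folding constants (powers of x modulo the reflected | |
// CRC polynomial) and XORed with the next 16 data bytes; this follows the approach of | |
// Intel's "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" paper | |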
#endif | |
while (size >= 48) | |
{ | |
uint32x4_t va = vsetq_lane_u32(a, zero, 0); | |
uint32x4_t vb = vsetq_lane_u32(b, zero, 0); | |
uint32x4_t vs = zero; | |
// process as many 3x16-byte blocks as possible | |
size_t block_count = size / 48; | |
block_count = block_count < UPNG__ADLER32_BLOCKS3 ? block_count : UPNG__ADLER32_BLOCKS3; | |
for (size_t i = 0; i < block_count; i++) | |
{ | |
uint8x16x3_t vlast = vld1q_u8_x3(last); | |
uint8x16x3_t vsrc = vld1q_u8_x3(src); | |
uint8x16_t v0 = vsubq_u8(vsrc.val[0], vlast.val[0]); | |
uint8x16_t v1 = vsubq_u8(vsrc.val[1], vlast.val[1]); | |
uint8x16_t v2 = vsubq_u8(vsrc.val[2], vlast.val[2]); | |
uint8x16_t vdst0 = vqtbx1q_u8(vqtbl1q_u8(v0, s00), v1, s01); | |
uint8x16_t vdst1 = vqtbx1q_u8(vqtbx1q_u8(vqtbl1q_u8(v0, s10), v1, s11), v2, s12); | |
uint8x16_t vdst2 = vqtbx1q_u8(vqtbl1q_u8(v1, s21), v2, s22); | |
uint8x16x3_t vdst = { vdst0, vdst1, vdst2 }; | |
vst1q_u8_x3(dst, vdst); | |
last += inc; | |
src += 48; | |
dst += 48; | |
size -= 48; | |
vs = vaddq_u32(vs, va); | |
va = vpadalq_u16(va, vpaddlq_u8(vdst0)); | |
vs = vaddq_u32(vs, va); | |
va = vpadalq_u16(va, vpaddlq_u8(vdst1)); | |
vs = vaddq_u32(vs, va); | |
va = vpadalq_u16(va, vpaddlq_u8(vdst2)); | |
uint16x8_t t0, t1, t2; | |
t0 = vmull_u8(vget_low_u8(vdst0), vget_low_u8(cmul)); | |
t1 = vmull_u8(vget_low_u8(vdst1), vget_low_u8(cmul)); | |
t2 = vmull_u8(vget_low_u8(vdst2), vget_low_u8(cmul)); | |
t0 = vmlal_high_u8(t0, vdst0, cmul); | |
t1 = vmlal_high_u8(t1, vdst1, cmul); | |
t2 = vmlal_high_u8(t2, vdst2, cmul); | |
vb = vpadalq_u16(vb, t0); | |
vb = vpadalq_u16(vb, t1); | |
vb = vpadalq_u16(vb, t2); | |
#if defined(UPNG__ARM64_CRYPTO) | |
UPNG__CLMUL_P128(x0, crc_mul, vdst0); | |
UPNG__CLMUL_P128(x1, crc_mul, vdst1); | |
UPNG__CLMUL_P128(x2, crc_mul, vdst2); | |
crc_mul = k1k2; | |
#elif defined(UPNG__ARM64_CRC32) | |
crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 0)); | |
crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 1)); | |
crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst1), 0)); | |
crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst1), 1)); | |
crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst2), 0)); | |
crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst2), 1)); | |
#else | |
uint32_t b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 0) ^ crc; | |
uint32_t b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 1); | |
crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff]; | |
crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff]; | |
b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 2) ^ crc; | |
b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 3); | |
crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff]; | |
crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff]; | |
b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 0) ^ crc; | |
b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 1); | |
crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff]; | |
crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff]; | |
b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 2) ^ crc; | |
b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 3); | |
crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff]; | |
crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff]; | |
b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 0) ^ crc; | |
b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 1); | |
crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff]; | |
crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff]; | |
b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 2) ^ crc; | |
b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 3); | |
crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff]; | |
crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff]; | |
#endif | |
} | |
vb = vaddq_u32(vb, vshlq_n_u32(vs, 4)); | |
a = vaddvq_u32(va); | |
b = vaddvq_u32(vb); | |
a %= UPNG__ADLER32_MOD; | |
b %= UPNG__ADLER32_MOD; | |
} | |
#if defined(UPNG__ARM64_CRYPTO) | |
// reduce 384-bit to 128-bit | |
UPNG__CLMUL_P128(x0, k3k4, vreinterpretq_u8_p128(x1)); | |
UPNG__CLMUL_P128(x0, k3k4, vreinterpretq_u8_p128(x2)); | |
// reduce 128-bit to 96-bit | |
poly128_t p0; | |
p0 = vreinterpretq_p128_u8(vextq_u8(vreinterpretq_u8_p128(x0), vdupq_n_u8(0), 8)); | |
x0 = UPNG__CLADD_P128(p0, vmull_p64(vgetq_lane_p64(vreinterpretq_p64_p128(x0), 0), vgetq_lane_p64(k3k4, 1))); | |
// reduce 96-bit to 64-bit | |
p0 = vreinterpretq_p128_u8(vextq_u8(vreinterpretq_u8_p128(x0), vdupq_n_u8(0), 4)); | |
x0 = UPNG__CLADD_P128(p0, vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x0), mask32)), 0), k5)); | |
// reduce 64-bit to 32-bit | |
x1 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x0), mask32)), 0), poly_u); | |
x1 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x1), mask32)), 0), poly_p); | |
crc = vgetq_lane_u32(vreinterpretq_u32_p128(UPNG__CLADD_P128(x0, x1)), 1); | |
#undef UPNG__CLMUL_P128 | |
#undef UPNG__CLADD_P128 | |
#endif | |
idat->adler = a | (b << 16); | |
idat->crc = ~crc; | |
return dst - out; | |
} | |
static size_t upng__unrow1_arm64(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64) | |
{ | |
uint8_t* out = dst; | |
uint64_t shuffle64_high = shuffle64 + 0x0808080808080808; | |
const uint8x16_t shuffle = vreinterpretq_u8_u64(vcombine_u64(vdup_n_u64(shuffle64), vdup_n_u64(shuffle64_high))); | |
while (size >= 16) | |
{ | |
uint8x16_t vdst0 = vqtbl1q_u8(vld1q_u8(src), shuffle); | |
vdst0 = vaddq_u8(vdst0, vld1q_u8(last)); | |
vst1q_u8(dst, vdst0); | |
last += inc; | |
src += 16; | |
dst += 16; | |
size -= 16; | |
} | |
return dst - out; | |
} | |
static size_t upng__unrow3_arm64(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, const uint8_t(*shuffle)[16]) | |
{ | |
uint8_t* out = dst; | |
const uint8x16_t s00 = vld1q_u8(shuffle[0]); | |
const uint8x16_t s01 = vld1q_u8(shuffle[1]); | |
const uint8x16_t s10 = vld1q_u8(shuffle[2]); | |
const uint8x16_t s11 = vld1q_u8(shuffle[3]); | |
const uint8x16_t s12 = vld1q_u8(shuffle[4]); | |
const uint8x16_t s21 = vld1q_u8(shuffle[5]); | |
const uint8x16_t s22 = vld1q_u8(shuffle[6]); | |
while (size >= 48) | |
{ | |
uint8x16x3_t vsrc = vld1q_u8_x3(src); | |
uint8x16_t vsrc0 = vsrc.val[0]; | |
uint8x16_t vsrc1 = vsrc.val[1]; | |
uint8x16_t vsrc2 = vsrc.val[2]; | |
uint8x16_t vdst0 = vqtbx1q_u8(vqtbl1q_u8(vsrc0, s00), vsrc1, s01); | |
uint8x16_t vdst1 = vqtbx1q_u8(vqtbx1q_u8(vqtbl1q_u8(vsrc0, s10), vsrc1, s11), vsrc2, s12); | |
uint8x16_t vdst2 = vqtbx1q_u8(vqtbl1q_u8(vsrc1, s21), vsrc2, s22); | |
uint8x16x3_t vlast = vld1q_u8_x3(last); | |
vdst0 = vaddq_u8(vdst0, vlast.val[0]); | |
vdst1 = vaddq_u8(vdst1, vlast.val[1]); | |
vdst2 = vaddq_u8(vdst2, vlast.val[2]); | |
uint8x16x3_t vdst = { vdst0, vdst1, vdst2 }; | |
vst1q_u8_x3(dst, vdst); | |
last += inc; | |
src += 48; | |
dst += 48; | |
size -= 48; | |
} | |
return dst - out; | |
} | |
#endif | |
#define _ 0xff | |
#define __ _ | |
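// a 0xff lane index makes NEON TBL (vqtbl1q_u8) write 0 and TBX (vqtbx1q_u8) keep the | |
// destination lane unchanged, so each output vector below is layered from one TBL plus | |
// one or two TBX lookups; x86 pshufb likewise zeroes lanes whose high bit is set | |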
// identity shuffle, nothing to rearrange | |
static const uint8_t UPNG__ALIGN(16, upng__shuffle_RGB8[7][16]) = | |
{ | |
{ 0,1,2, 3,4,5, 6,7,8, 9,10,11, 12,13,14, 15}, | |
{ _,_,_, _,_,_, _,_,_, _,__,__, __,__,__, __}, | |
{ _,_,_, _,_,_, _,_,_, _,__,__, __,__,__, __}, | |
{ 0,1,2, 3,4,5, 6,7,8, 9,10,11, 12,13,14, 15}, | |
{ _,_,_, _,_,_, _,_,_, _,__,__, __,__,__, __}, | |
{ _,_,_, _,_,_, _,_,_, _,__,__, __,__,__, __}, | |
{ 0,1,2, 3,4,5, 6,7,8, 9,10,11, 12,13,14, 15}, | |
}; | |
// 0123456789012345 | |
// src0 = [BGRBGRBGRBGRBGRB] | |
// src1 = [GRBGRBGRBGRBGRBG] | |
// src2 = [RBGRBGRBGRBGRBGR] | |
static const uint8_t UPNG__ALIGN(16, upng__shuffle_BGR8[7][16]) = | |
{ | |
{ 2,1,0, 5,4,3, 8,7,6, 11,10,9, 14,13,12, _}, // RGB RGB RGB RGB RGB _ | |
{ _,_,_, _,_,_, _,_,_, __,__,_, __,__,__, 1}, // ___ ___ ___ ___ ___ R | |
{ _,15, _,_,_, _,_,_, __,_,_, __,__,__, _,__}, // _B ___ ___ ___ ___ __ | |
{ 0,__, 4,3,2, 7,6,5, 10,9,8, 13,12,11, _,15}, // G_ RGB RGB RGB RGB _G | |
{ _,__, _,_,_, _,_,_, __,_,_, __,__,__, 0,__}, // __ ___ ___ ___ ___ R_ | |
{14, _,_,_, _,_,_, _,_,_, __,__,__, __,__,__}, // B ___ ___ ___ ___ ___ | |
{__, 3,2,1, 6,5,4, 9,8,7, 12,11,10, 15,14,13}, // _ RGB RGB RGB RGB RGB | |
}; | |
// only swap bytes in each 16-bit value | |
static const uint8_t UPNG__ALIGN(16, upng__shuffle_RGB16[7][16]) = | |
{ | |
{1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14}, | |
{_,_, _,_, _,_, _,_, _,_, __,__, __,__, __,__}, | |
{_,_, _,_, _,_, _,_, _,_, __,__, __,__, __,__}, | |
{1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14}, | |
{_,_, _,_, _,_, _,_, _,_, __,__, __,__, __,__}, | |
{_,_, _,_, _,_, _,_, _,_, __,__, __,__, __,__}, | |
{1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14}, | |
}; | |
// 0123456789012345 | |
// src0 = [BBGGRRBBGGRRBBGG] | |
// src1 = [RRBBGGRRBBGGRRBB] | |
// src2 = [GGRRBBGGRRBBGGRR] | |
static const uint8_t UPNG__ALIGN(16, upng__shuffle_BGR16[7][16]) = | |
{ | |
{5,4,3,2,1,0, 11,10,9,8,7,6, _,_,15,14}, // RRGGBB RRGGBB __GG | |
{_,_,_,_,_,_, __,__,_,_,_,_, 1,0,__,__}, // ______ ______ RR__ | |
{13,12, _,_,_,_,_,_, __,__,__,__,_,_, _,_}, // BB ______ ______ __ | |
{__,__, 7,6,5,4,3,2, 13,12,11,10,9,8, _,_}, // __ RRGGBB RRGGBB __ | |
{__,__, _,_,_,_,_,_, __,__,__,__,_,_, 3,2}, // __ ______ ______ RR | |
{_,_, 15,14, _,_,_,_,_,_, __,__,__,__,__,__}, // __BB ______ ______ | |
{1,0, __,__, 9,8,7,6,5,4, 15,14,13,12,11,10}, // GG__ RRGGBB RRGGBB | |
}; | |
#undef _ | |
#undef __ | |
// handles G8, GA8, RGBA8, BGRA8, G16, GA16, BGRA16, RGBA16 | |
static size_t upng__row1(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, upng_format format) | |
{ | |
uint64_t shuffle64; | |
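// shuffle64 packs 8 source-lane indices, least significant byte first; the SIMD helpers | |
// derive the upper half of a 16-byte shuffle by adding 8 to each index (see upng__unrow1_arm64) | |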
switch (format) | |
{ | |
case UPNG_FORMAT_G8: | |
case UPNG_FORMAT_GA8: | |
case UPNG_FORMAT_RGBA8: | |
shuffle64 = 0x0706050403020100; // nothing to shuffle, identity | |
break; | |
case UPNG_FORMAT_BGRA8: | |
shuffle64 = 0x0704050603000102; // BGRA to RGBA | |
break; | |
case UPNG_FORMAT_G16: | |
case UPNG_FORMAT_GA16: | |
case UPNG_FORMAT_RGBA16: | |
shuffle64 = 0x0607040502030001; // swap bytes | |
break; | |
case UPNG_FORMAT_BGRA16: | |
shuffle64 = 0x0607000102030405; // swap bytes, BGRA to RGBA | |
break; | |
default: | |
shuffle64 = 0; | |
break; | |
} | |
#if defined(UPNG__ARCH_X64_AVX2) | |
int cpuid = upng__cpuid(); | |
if (cpuid & UPNG__CPUID_CLMUL) | |
{ | |
return upng__row1_avx2(idat, dst, src, last, size, inc, shuffle64); | |
} | |
else | |
{ | |
return upng__row1_sse4(idat, dst, src, last, size, inc, shuffle64); | |
} | |
#elif defined(UPNG__ARCH_X64) | |
int cpuid = upng__cpuid(); | |
if (cpuid & UPNG__CPUID_CLMUL) | |
{ | |
return upng__row1_clmul(idat, dst, src, last, size, inc, shuffle64); | |
} | |
else if (cpuid & UPNG__CPUID_SSE41) | |
{ | |
return upng__row1_sse4(idat, dst, src, last, size, inc, shuffle64); | |
} | |
#elif defined(UPNG__ARCH_ARM64) | |
return upng__row1_arm64(idat, dst, src, last, size, inc, shuffle64); | |
#else | |
(void)shuffle64; | |
#endif | |
return 0; | |
} | |
// handles RGB8, BGR8, RGB16, BGR16 | |
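// 3-byte and 6-byte pixels are processed in 48-byte blocks, since 48 = lcm(3,16) = lcm(6,16), | |
// which keeps every vector block aligned to pixel boundaries | |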
static size_t upng__row3(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, upng_format format) | |
{ | |
const uint8_t (*shuffle)[16]; | |
switch (format) | |
{ | |
case UPNG_FORMAT_RGB8: | |
shuffle = upng__shuffle_RGB8; | |
break; | |
case UPNG_FORMAT_BGR8: | |
shuffle = upng__shuffle_BGR8; | |
break; | |
case UPNG_FORMAT_RGB16: | |
shuffle = upng__shuffle_RGB16; | |
break; | |
case UPNG_FORMAT_BGR16: | |
shuffle = upng__shuffle_BGR16; | |
break; | |
default: | |
shuffle = NULL; | |
break; | |
} | |
#if defined(UPNG__ARCH_X64) | |
int cpuid = upng__cpuid(); | |
if (cpuid & UPNG__CPUID_CLMUL) | |
{ | |
return upng__row3_clmul(idat, dst, src, last, size, inc, shuffle); | |
} | |
else if (cpuid & UPNG__CPUID_SSE41) | |
{ | |
return upng__row3_sse4(idat, dst, src, last, size, inc, shuffle); | |
} | |
#elif defined(UPNG__ARCH_ARM64) | |
return upng__row3_arm64(idat, dst, src, last, size, inc, shuffle); | |
#else | |
(void)shuffle; | |
#endif | |
return 0; | |
} | |
static void upng__row(upng__idat* idat, uint8_t* dst, const uint8_t* src, size_t pitch, size_t size, upng_format format, upng_filter filter) | |
{ | |
// the NONE filter is the same as UP when the previous row is all zero values | |
// (dst = src - 0 == src), so both filters share one code path with "last" pointing at a zero row | |
static const uint8_t UPNG__ALIGN(64, zero[64]) = { 0 }; | |
const uint8_t* last = filter == UPNG_FILTER_NONE ? zero : src - pitch; | |
size_t inc; | |
size_t used; | |
if (format == UPNG_FORMAT_RGB8 || format == UPNG_FORMAT_BGR8 || format == UPNG_FORMAT_RGB16 || format == UPNG_FORMAT_BGR16) | |
{ | |
inc = filter == UPNG_FILTER_NONE ? 0 : 48; | |
used = upng__row3(idat, dst, src, last, size, inc, format); | |
} | |
else | |
{ | |
inc = filter == UPNG_FILTER_NONE ? 0 : 16; | |
used = upng__row1(idat, dst, src, last, size, inc, format); | |
} | |
last += inc == 0 ? 0 : used; | |
src += used; | |
dst += used; | |
size -= used; | |
uint8_t* tail = dst; | |
size_t tail_size = size; | |
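// the SIMD helpers above have already folded adler32/crc32 for the bytes they produced; | |
// the scalar tail handled below is checksummed once at the end of this function | |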
// remaining size is less than one vector block (16 or 48 bytes); it is the whole row when no SIMD path is available | |
switch (format) | |
{ | |
case UPNG_FORMAT_G8: // 16 pixels per 16 bytes | |
inc /= 16; // 0 or 1 | |
while (size != 0) | |
{ | |
dst[0] = src[0] - last[0]; | |
last += inc; | |
src += 1; | |
dst += 1; | |
size -= 1; | |
} | |
break; | |
case UPNG_FORMAT_GA8: // 8 pixels per 16 bytes | |
inc /= 8; // 0 or 2 | |
while (size != 0) | |
{ | |
dst[0] = src[0] - last[0]; | |
dst[1] = src[1] - last[1]; | |
last += inc; | |
src += 2; | |
dst += 2; | |
size -= 2; | |
} | |
break; | |
case UPNG_FORMAT_RGB8: // 16 pixels per 48 bytes | |
inc /= 16; // 0 or 3 | |
while (size != 0) | |
{ | |
dst[0] = src[0] - last[0]; | |
dst[1] = src[1] - last[1]; | |
dst[2] = src[2] - last[2]; | |
last += inc; | |
src += 3; | |
dst += 3; | |
size -= 3; | |
} | |
break; | |
case UPNG_FORMAT_BGR8: // 16 pixels per 48 bytes | |
inc /= 16; // 0 or 3 | |
while (size != 0) | |
{ | |
dst[0] = src[2] - last[2]; | |
dst[1] = src[1] - last[1]; | |
dst[2] = src[0] - last[0]; | |
last += inc; | |
src += 3; | |
dst += 3; | |
size -= 3; | |
} | |
break; | |
case UPNG_FORMAT_RGBA8: // 4 pixels per 16 bytes | |
inc /= 4; // 0 or 4 | |
while (size != 0) | |
{ | |
dst[0] = src[0] - last[0]; | |
dst[1] = src[1] - last[1]; | |
dst[2] = src[2] - last[2]; | |
dst[3] = src[3] - last[3]; | |
last += inc; | |
src += 4; | |
dst += 4; | |
size -= 4; | |
} | |
break; | |
case UPNG_FORMAT_BGRA8: // 4 pixels per 16 bytes | |
inc /= 4; // 0 or 4 | |
while (size != 0) | |
{ | |
dst[0] = src[2] - last[2]; | |
dst[1] = src[1] - last[1]; | |
dst[2] = src[0] - last[0]; | |
dst[3] = src[3] - last[3]; | |
last += inc; | |
src += 4; | |
dst += 4; | |
size -= 4; | |
} | |
break; | |
case UPNG_FORMAT_G16: // 8 pixels per 16 bytes | |
inc /= 8; // 0 or 2 | |
while (size != 0) | |
{ | |
dst[0] = src[1] - last[1]; | |
dst[1] = src[0] - last[0]; | |
last += inc; | |
src += 2; | |
dst += 2; | |
size -= 2; | |
} | |
break; | |
case UPNG_FORMAT_GA16: // 4 pixels per 16 bytes | |
inc /= 4; // 0 or 4 | |
while (size != 0) | |
{ | |
dst[0] = src[1] - last[1]; | |
dst[1] = src[0] - last[0]; | |
dst[2] = src[3] - last[3]; | |
dst[3] = src[2] - last[2]; | |
last += inc; | |
src += 4; | |
dst += 4; | |
size -= 4; | |
} | |
break; | |
case UPNG_FORMAT_RGB16: // 8 pixels per 48 bytes | |
inc /= 8; // 0 or 6 | |
while (size != 0) | |
{ | |
dst[0] = src[1] - last[1]; | |
dst[1] = src[0] - last[0]; | |
dst[2] = src[3] - last[3]; | |
dst[3] = src[2] - last[2]; | |
dst[4] = src[5] - last[5]; | |
dst[5] = src[4] - last[4]; | |
last += inc; | |
src += 6; | |
dst += 6; | |
size -= 6; | |
} | |
break; | |
case UPNG_FORMAT_BGR16: // 8 pixels per 48 bytes | |
inc /= 8; // 0 or 6 | |
while (size != 0) | |
{ | |
dst[0] = src[5] - last[5]; | |
dst[1] = src[4] - last[4]; | |
dst[2] = src[3] - last[3]; | |
dst[3] = src[2] - last[2]; | |
dst[4] = src[1] - last[1]; | |
dst[5] = src[0] - last[0]; | |
last += inc; | |
src += 6; | |
dst += 6; | |
size -= 6; | |
} | |
break; | |
case UPNG_FORMAT_RGBA16: // 2 pixels per 16 bytes | |
inc /= 2; // 0 or 8 | |
while (size != 0) | |
{ | |
dst[0] = src[1] - last[1]; | |
dst[1] = src[0] - last[0]; | |
dst[2] = src[3] - last[3]; | |
dst[3] = src[2] - last[2]; | |
dst[4] = src[5] - last[5]; | |
dst[5] = src[4] - last[4]; | |
dst[6] = src[7] - last[7]; | |
dst[7] = src[6] - last[6]; | |
last += inc; | |
src += 8; | |
dst += 8; | |
size -= 8; | |
} | |
break; | |
case UPNG_FORMAT_BGRA16: // 2 pixels per 16 bytes | |
inc /= 2; // 0 or 8 | |
while (size != 0) | |
{ | |
dst[0] = src[5] - last[5]; | |
dst[1] = src[4] - last[4]; | |
dst[2] = src[3] - last[3]; | |
dst[3] = src[2] - last[2]; | |
dst[4] = src[1] - last[1]; | |
dst[5] = src[0] - last[0]; | |
dst[6] = src[7] - last[7]; | |
dst[7] = src[6] - last[6]; | |
last += inc; | |
src += 8; | |
dst += 8; | |
size -= 8; | |
} | |
break; | |
} | |
idat->adler = upng__adler32(idat->adler, tail, tail_size); | |
idat->crc = upng__crc32(idat->crc, tail, tail_size); | |
} | |
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
// handles G8, GA8, RGBA8, BGRA8, G16, GA16, BGRA16, RGBA16 | |
static size_t upng__unrow1(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, upng_format format) | |
{ | |
uint64_t shuffle64; | |
switch (format) | |
{ | |
case UPNG_FORMAT_G8: | |
case UPNG_FORMAT_GA8: | |
case UPNG_FORMAT_RGBA8: | |
shuffle64 = 0x0706050403020100; // nothing to shuffle, identity | |
break; | |
case UPNG_FORMAT_BGRA8: | |
shuffle64 = 0x0704050603000102; // BGRA to RGBA | |
break; | |
case UPNG_FORMAT_G16: | |
case UPNG_FORMAT_GA16: | |
case UPNG_FORMAT_RGBA16: | |
shuffle64 = 0x0607040502030001; // swap bytes | |
break; | |
case UPNG_FORMAT_BGRA16: | |
shuffle64 = 0x0607000102030405; // swap bytes, BGRA to RGBA | |
break; | |
default: | |
shuffle64 = 0; | |
break; | |
} | |
#if defined(UPNG__ARCH_X64_AVX2) | |
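// an AVX2-enabled build already requires SSE4.1 on the target CPU, so the SSE4 path is called directly | |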
return upng__unrow1_sse4(dst, src, last, size, inc, shuffle64); | |
#elif defined(UPNG__ARCH_X64) | |
int cpuid = upng__cpuid(); | |
if (cpuid & UPNG__CPUID_SSE41) | |
{ | |
return upng__unrow1_sse4(dst, src, last, size, inc, shuffle64); | |
} | |
#elif defined(UPNG__ARCH_ARM64) | |
return upng__unrow1_arm64(dst, src, last, size, inc, shuffle64); | |
#else | |
(void)shuffle64; | |
#endif | |
return 0; | |
} | |
// handles RGB8, BGR8, RGB16, BGR16 | |
static size_t upng__unrow3(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, upng_format format) | |
{ | |
const uint8_t(*shuffle)[16]; | |
switch (format) | |
{ | |
case UPNG_FORMAT_RGB8: | |
shuffle = upng__shuffle_RGB8; | |
break; | |
case UPNG_FORMAT_BGR8: | |
shuffle = upng__shuffle_BGR8; | |
break; | |
case UPNG_FORMAT_RGB16: | |
shuffle = upng__shuffle_RGB16; | |
break; | |
case UPNG_FORMAT_BGR16: | |
shuffle = upng__shuffle_BGR16; | |
break; | |
default: | |
shuffle = NULL; | |
break; | |
} | |
#if defined(UPNG__ARCH_X64) | |
int cpuid = upng__cpuid(); | |
if (cpuid & UPNG__CPUID_SSE41) | |
{ | |
return upng__unrow3_sse4(dst, src, last, size, inc, shuffle); | |
} | |
#elif defined(UPNG__ARCH_ARM64) | |
return upng__unrow3_arm64(dst, src, last, size, inc, shuffle); | |
#else | |
(void)shuffle; | |
#endif | |
return 0; | |
} | |
static void upng__unrow(uint8_t* dst, const uint8_t* src, size_t pitch, size_t size, upng_format format, upng_filter filter) | |
{ | |
// the NONE filter is the same as UP when the previous row is all zero values | |
// (dst = src + 0 == src), so both filters share one code path with "last" pointing at a zero row | |
static const uint8_t UPNG__ALIGN(64, zero[64]) = { 0 }; | |
const uint8_t* last = filter == UPNG_FILTER_NONE ? zero : dst - pitch; | |
size_t inc; | |
size_t used; | |
if (format == UPNG_FORMAT_RGB8 || format == UPNG_FORMAT_BGR8 || format == UPNG_FORMAT_RGB16 || format == UPNG_FORMAT_BGR16) | |
{ | |
inc = filter == UPNG_FILTER_NONE ? 0 : 48; | |
used = upng__unrow3(dst, src, last, size, inc, format); | |
} | |
else | |
{ | |
inc = filter == UPNG_FILTER_NONE ? 0 : 16; | |
used = upng__unrow1(dst, src, last, size, inc, format); | |
} | |
last += inc == 0 ? 0 : used; | |
src += used; | |
dst += used; | |
size -= used; | |
// remaining size is less than one vector block (16 or 48 bytes); it is the whole row when no SIMD path is available | |
switch (format) | |
{ | |
case UPNG_FORMAT_G8: // 16 pixels per 16 bytes | |
inc /= 16; // 0 or 1 | |
while (size != 0) | |
{ | |
dst[0] = src[0] + last[0]; | |
last += inc; | |
src += 1; | |
dst += 1; | |
size -= 1; | |
} | |
break; | |
case UPNG_FORMAT_GA8: // 8 pixels per 16 bytes | |
inc /= 8; // 0 or 2 | |
while (size != 0) | |
{ | |
dst[0] = src[0] + last[0]; | |
dst[1] = src[1] + last[1]; | |
last += inc; | |
src += 2; | |
dst += 2; | |
size -= 2; | |
} | |
break; | |
case UPNG_FORMAT_RGB8: // 16 pixels per 48 bytes | |
inc /= 16; // 0 or 3 | |
while (size != 0) | |
{ | |
dst[0] = src[0] + last[0]; | |
dst[1] = src[1] + last[1]; | |
dst[2] = src[2] + last[2]; | |
last += inc; | |
src += 3; | |
dst += 3; | |
size -= 3; | |
} | |
break; | |
case UPNG_FORMAT_BGR8: // 16 pixels per 48 bytes | |
inc /= 16; // 0 or 3 | |
while (size != 0) | |
{ | |
dst[0] = src[2] + last[0]; | |
dst[1] = src[1] + last[1]; | |
dst[2] = src[0] + last[2]; | |
last += inc; | |
src += 3; | |
dst += 3; | |
size -= 3; | |
} | |
break; | |
case UPNG_FORMAT_RGBA8: // 4 pixels per 16 bytes | |
inc /= 4; // 0 or 4 | |
while (size != 0) | |
{ | |
dst[0] = src[0] + last[0]; | |
dst[1] = src[1] + last[1]; | |
dst[2] = src[2] + last[2]; | |
dst[3] = src[3] + last[3]; | |
last += inc; | |
src += 4; | |
dst += 4; | |
size -= 4; | |
} | |
break; | |
case UPNG_FORMAT_BGRA8: // 4 pixels per 16 bytes | |
inc /= 4; // 0 or 4 | |
while (size != 0) | |
{ | |
dst[0] = src[2] + last[0]; | |
dst[1] = src[1] + last[1]; | |
dst[2] = src[0] + last[2]; | |
dst[3] = src[3] + last[3]; | |
last += inc; | |
src += 4; | |
dst += 4; | |
size -= 4; | |
} | |
break; | |
case UPNG_FORMAT_G16: // 8 pixels per 16 bytes | |
inc /= 8; // 0 or 2 | |
while (size != 0) | |
{ | |
dst[0] = src[1] + last[0]; | |
dst[1] = src[0] + last[1]; | |
last += inc; | |
src += 2; | |
dst += 2; | |
size -= 2; | |
} | |
break; | |
case UPNG_FORMAT_GA16: // 4 pixels per 16 bytes | |
inc /= 4; // 0 or 4 | |
while (size != 0) | |
{ | |
dst[0] = src[1] + last[0]; | |
dst[1] = src[0] + last[1]; | |
dst[2] = src[3] + last[2]; | |
dst[3] = src[2] + last[3]; | |
last += inc; | |
src += 4; | |
dst += 4; | |
size -= 4; | |
} | |
break; | |
case UPNG_FORMAT_RGB16: // 8 pixels per 48 bytes | |
inc /= 8; // 0 or 6 | |
while (size != 0) | |
{ | |
dst[0] = src[1] + last[0]; | |
dst[1] = src[0] + last[1]; | |
dst[2] = src[3] + last[2]; | |
dst[3] = src[2] + last[3]; | |
dst[4] = src[5] + last[4]; | |
dst[5] = src[4] + last[5]; | |
last += inc; | |
src += 6; | |
dst += 6; | |
size -= 6; | |
} | |
break; | |
case UPNG_FORMAT_BGR16: // 8 pixels per 48 bytes | |
inc /= 8; // 0 or 6 | |
while (size != 0) | |
{ | |
dst[0] = src[5] + last[0]; | |
dst[1] = src[4] + last[1]; | |
dst[2] = src[3] + last[2]; | |
dst[3] = src[2] + last[3]; | |
dst[4] = src[1] + last[4]; | |
dst[5] = src[0] + last[5]; | |
last += inc; | |
src += 6; | |
dst += 6; | |
size -= 6; | |
} | |
break; | |
case UPNG_FORMAT_RGBA16: // 2 pixels per 16 bytes | |
inc /= 2; // 0 or 8 | |
while (size != 0) | |
{ | |
dst[0] = src[1] + last[0]; | |
dst[1] = src[0] + last[1]; | |
dst[2] = src[3] + last[2]; | |
dst[3] = src[2] + last[3]; | |
dst[4] = src[5] + last[4]; | |
dst[5] = src[4] + last[5]; | |
dst[6] = src[7] + last[6]; | |
dst[7] = src[6] + last[7]; | |
last += inc; | |
src += 8; | |
dst += 8; | |
size -= 8; | |
} | |
break; | |
case UPNG_FORMAT_BGRA16: // 2 pixels per 16 bytes | |
inc /= 2; // 0 or 8 | |
while (size != 0) | |
{ | |
dst[0] = src[5] + last[0]; | |
dst[1] = src[4] + last[1]; | |
dst[2] = src[3] + last[2]; | |
dst[3] = src[2] + last[3]; | |
dst[4] = src[1] + last[4]; | |
dst[5] = src[0] + last[5]; | |
dst[6] = src[7] + last[6]; | |
dst[7] = src[6] + last[7]; | |
last += inc; | |
src += 8; | |
dst += 8; | |
size -= 8; | |
} | |
break; | |
} | |
} | |
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
static const char upng__sig[] = "\x89PNG\r\n\x1a\n"; | |
static const size_t upng__ihdr_size = 13; | |
// max chunk size allowed by the PNG spec (2^31 - 1 bytes) | |
static const size_t upng__max_chunk_size = (1U << 31) - 1; | |
static const uint32_t upng__bpp[] = | |
{ | |
1, // G8 | |
2, // GA8 | |
3, // RGB8 | |
3, // BGR8 | |
4, // RGBA8 | |
4, // BGRA8 | |
2, // G16 | |
4, // GA16 | |
6, // RGB16 | |
6, // BGR16 | |
8, // RGBA16 | |
8, // BGRA16 | |
}; | |
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
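// Example usage, writing: a minimal sketch (not part of the library); "pixels", "width" and | |
// "height" are caller-provided, error handling and freeing are omitted: | |
// | |
//   size_t needed = upng_write(NULL, pixels, width, height, 0, UPNG_FORMAT_BGRA8, UPNG_FILTER_UP); | |
//   void* png = malloc(needed); | |
//   size_t written = upng_write(png, pixels, width, height, 0, UPNG_FORMAT_BGRA8, UPNG_FILTER_UP); | |
//   // "written" equals "needed" on success, 0 for unsupported parameter values | |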
size_t upng_write(void* dst, const void* src, uint32_t width, uint32_t height, size_t pitch, upng_format format, upng_filter filter) | |
{ | |
if (width == 0 || height == 0) | |
{ | |
// bad width or height | |
return 0; | |
} | |
const uint32_t bpp = upng__bpp[format]; | |
if (pitch == 0) | |
{ | |
pitch = width * bpp; | |
} | |
if ((size_t)width * height < (size_t)width || (size_t)width * height >= (size_t)1 << 48) | |
{ | |
// width and height too big | |
return 0; | |
} | |
if ((size_t)pitch * bpp < (size_t)pitch) | |
{ | |
// pitch too large, overflows size_t | |
return 0; | |
} | |
static const char iend_chunk[] = "\0\0\0\0IEND\xae\x42\x60\x82"; | |
// max zlib block size | |
const uint32_t max_block_size = 65535; | |
// how many pixels fit into one zlib block (conservative estimate, because of filter byte) | |
const uint32_t pixels_per_block = (max_block_size - 1) / bpp; | |
// how many full zlib blocks needed per row | |
size_t full_block_count = width / pixels_per_block; | |
// how many pixels are left | |
size_t tail_block_pixels = width % pixels_per_block; | |
// how many bytes in full zlib blocks | |
size_t full_block_size = full_block_count * (1 + 4 + pixels_per_block * bpp); | |
// how many bytes in last zlib block | |
size_t last_block_size = tail_block_pixels ? (1 + 4 + tail_block_pixels * bpp) : 0; | |
// total size per row including filter byte | |
size_t size_per_row = 1 + full_block_size + last_block_size; | |
// how many rows fit into IDAT chunk | |
size_t rows_per_idat = upng__max_chunk_size / size_per_row; | |
if (rows_per_idat == 0) | |
{ | |
// code assumes it can fit at least one full row (including zlib block headers) into IDAT | |
return 0; | |
} | |
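// worked example for width=8192, BGRA8 (bpp=4): pixels_per_block = 65534/4 = 16383, | |
// full_block_count = 8192/16383 = 0, tail_block_pixels = 8192, last_block_size = 1+4+32768 = 32773, | |
// size_per_row = 1 + 0 + 32773 = 32774, rows_per_idat = 0x7fffffff/32774 = 65524 | |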
if (!dst) | |
{ | |
size_t size = 0; | |
// png signature | |
size += sizeof(upng__sig) - 1; | |
// IHDR chunk | |
size += 4 + 4 + upng__ihdr_size + 4; | |
// first IDAT chunk contains 2 zlib header bytes | |
size += 4 + 4 + 2 + 4; | |
// how many full IDAT chunks | |
size_t full_idat_count = height / rows_per_idat; | |
// how many rows in last IDAT | |
size_t tail_idat_rows = height % rows_per_idat; | |
size += (4 + 4 + rows_per_idat * size_per_row + 4) * full_idat_count; | |
size += tail_idat_rows ? (4 + 4 + tail_idat_rows * size_per_row + 4) : 0; | |
// last IDAT chunk with empty zlib block & adler32 | |
size += 4 + 4 + (1 + 4) + (4) + 4; | |
// IEND chunk | |
size += sizeof(iend_chunk) - 1; | |
return size; | |
} | |
upng__crc32_init(); | |
uint8_t* out = (uint8_t*)dst; | |
// file signature, https://www.w3.org/TR/png/#5PNG-file-signature | |
for (size_t i = 0; i < sizeof(upng__sig) - 1; i++) | |
{ | |
*out++ = (uint8_t)upng__sig[i]; | |
} | |
// IHDR, https://www.w3.org/TR/png/#11IHDR | |
{ | |
// https://www.w3.org/TR/png/#6Colour-values | |
static const uint8_t bits[] = { 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16 }; | |
static const uint8_t type[] = { 0, 4, 2, 2, 6, 6, 0, 4, 2, 2, 6, 6 }; | |
*out++ = 0; | |
*out++ = 0; | |
*out++ = 0; | |
*out++ = (uint8_t)upng__ihdr_size; | |
*out++ = (uint8_t)'I'; | |
*out++ = (uint8_t)'H'; | |
*out++ = (uint8_t)'D'; | |
*out++ = (uint8_t)'R'; | |
*out++ = (uint8_t)(width >> 24); | |
*out++ = (uint8_t)(width >> 16); | |
*out++ = (uint8_t)(width >> 8); | |
*out++ = (uint8_t)(width); | |
*out++ = (uint8_t)(height >> 24); | |
*out++ = (uint8_t)(height >> 16); | |
*out++ = (uint8_t)(height >> 8); | |
*out++ = (uint8_t)(height); | |
*out++ = bits[format]; | |
*out++ = type[format]; | |
*out++ = 0; // zlib compression | |
*out++ = 0; // filter method | |
*out++ = 0; // no interlace | |
uint32_t crc = upng__crc32(UPNG__CRC32_INIT, out - upng__ihdr_size - 4, upng__ihdr_size + 4); | |
*out++ = (uint8_t)(crc >> 24); | |
*out++ = (uint8_t)(crc >> 16); | |
*out++ = (uint8_t)(crc >> 8); | |
*out++ = (uint8_t)(crc); | |
} | |
// first IDAT contains just 2 bytes of zlib format | |
{ | |
*out++ = 0; | |
*out++ = 0; | |
*out++ = 0; | |
*out++ = 2; | |
*out++ = (uint8_t)'I'; | |
*out++ = (uint8_t)'D'; | |
*out++ = (uint8_t)'A'; | |
*out++ = (uint8_t)'T'; | |
*out++ = 0x78; // CM=8, CINFO=7 | |
*out++ = 0x01; // FCHECK=1, FDICT=0, FLEVEL=0 | |
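// this fixed zlib header passes the FCHECK test: 0x78 * 256 + 0x01 = 30721 = 31 * 991, | |
// i.e. the 16-bit value is divisible by 31 as required | |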
uint32_t crc = 0xec1a7ed2; // crc32(out - 6, 6) | |
*out++ = (uint8_t)(crc >> 24); | |
*out++ = (uint8_t)(crc >> 16); | |
*out++ = (uint8_t)(crc >> 8); | |
*out++ = (uint8_t)(crc); | |
} | |
upng__idat idat; | |
idat.adler = UPNG__ADLER32_INIT; | |
for (size_t y0 = 0; y0 < height; y0 += rows_per_idat) | |
{ | |
size_t rows_in_idat = (height - y0) < rows_per_idat ? (height - y0) : rows_per_idat; | |
uint32_t idat_size = (uint32_t)(rows_in_idat * size_per_row); | |
// start of IDAT, https://www.w3.org/TR/png/#11IDAT | |
*out++ = (uint8_t)(idat_size >> 24); | |
*out++ = (uint8_t)(idat_size >> 16); | |
*out++ = (uint8_t)(idat_size >> 8); | |
*out++ = (uint8_t)(idat_size); | |
*out++ = (uint8_t)'I'; | |
*out++ = (uint8_t)'D'; | |
*out++ = (uint8_t)'A'; | |
*out++ = (uint8_t)'T'; | |
idat.crc = 0x35af061e; // crc32(out - 4, 4) | |
for (size_t yi = 0; yi < rows_in_idat; yi++) | |
{ | |
size_t y = y0 + yi; | |
// every row will always start on a new zlib block | |
for (size_t x = 0; x < width; x += pixels_per_block) | |
{ | |
// how many pixels to use | |
uint32_t pixel_count = (uint32_t)((width - x) < pixels_per_block ? (width - x) : pixels_per_block); | |
uint32_t pixel_size = pixel_count * bpp; | |
// include filter byte | |
uint32_t block_size = (x == 0 ? 1 : 0) + pixel_size; | |
*out++ = 0; // BFINAL=0, BTYPE=0 | |
*out++ = (uint8_t)(block_size); | |
*out++ = (uint8_t)(block_size >> 8); | |
*out++ = (uint8_t)(~block_size); | |
*out++ = (uint8_t)(~block_size >> 8); | |
idat.crc = upng__crc32(idat.crc, out - 5, 5); | |
// first row uses NONE, rest of them UP/NONE filter | |
upng_filter row_filter = y == 0 ? UPNG_FILTER_NONE : filter; | |
if (x == 0) | |
{ | |
// each row starts with filter byte | |
// https://www.w3.org/TR/png/#9Filter-types | |
*out++ = (uint8_t)row_filter; | |
idat.adler = upng__adler32(idat.adler, out - 1, 1); | |
idat.crc = upng__crc32(idat.crc, out - 1, 1); | |
} | |
const uint8_t* pix = (const uint8_t*)src + y * pitch + x * bpp; | |
upng__row(&idat, out, pix, pitch, pixel_size, format, row_filter); | |
out += pixel_size; | |
} | |
} | |
// end of IDAT | |
*out++ = (uint8_t)(idat.crc >> 24); | |
*out++ = (uint8_t)(idat.crc >> 16); | |
*out++ = (uint8_t)(idat.crc >> 8); | |
*out++ = (uint8_t)(idat.crc); | |
} | |
// one more IDAT with empty zlib block (1+4 bytes) and adler32 checksum (4 bytes) | |
{ | |
uint32_t idat_size = 1 + 4 + 4; | |
*out++ = 0; | |
*out++ = 0; | |
*out++ = 0; | |
*out++ = (uint8_t)idat_size; | |
*out++ = (uint8_t)'I'; | |
*out++ = (uint8_t)'D'; | |
*out++ = (uint8_t)'A'; | |
*out++ = (uint8_t)'T'; | |
*out++ = 1; // BFINAL=1, BTYPE=0 | |
*out++ = 0; // LEN = 0x0000 | |
*out++ = 0; | |
*out++ = 0xff; // NLEN = ~LEN = 0xffff | |
*out++ = 0xff; | |
*out++ = (uint8_t)(idat.adler >> 24); | |
*out++ = (uint8_t)(idat.adler >> 16); | |
*out++ = (uint8_t)(idat.adler >> 8); | |
*out++ = (uint8_t)(idat.adler); | |
uint32_t crc = upng__crc32(UPNG__CRC32_INIT, out - (4 + idat_size), 4 + idat_size); | |
*out++ = (uint8_t)(crc >> 24); | |
*out++ = (uint8_t)(crc >> 16); | |
*out++ = (uint8_t)(crc >> 8); | |
*out++ = (uint8_t)(crc); | |
} | |
// IEND, https://www.w3.org/TR/png/#11IEND | |
for (size_t i = 0; i < sizeof(iend_chunk) - 1; i++) | |
{ | |
*out++ = (uint8_t)iend_chunk[i]; | |
} | |
return out - (uint8_t*)dst; | |
} | |
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
typedef struct { | |
const uint8_t* in; | |
size_t size; | |
// how many bytes available in current IDAT chunk | |
uint32_t chunk_size; | |
// how many bytes available in current zlib block | |
uint32_t block_size; | |
// BFINAL bit of last zlib block | |
int bfinal; | |
// temporary buffer used to read data that could cross IDAT chunk boundaries | |
// 2 bytes for zlib format header in the beginning | |
// 4 bytes adler32 checksum at the end | |
// 5 bytes for zlib block header | |
uint8_t temp[5]; | |
uint32_t temp_size; | |
// max 8 bytes of one pixel (RGBA16) that could potentially be split across zlib blocks | |
uint8_t split_block[8]; | |
uint32_t split_size; | |
} upng__data; | |
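// note: the reader deals with two independent layers of framing that are not aligned to | |
// each other: PNG IDAT chunk boundaries and zlib stored-block boundaries. "temp" above | |
// reassembles a small header that straddles IDAT chunks; "split_block" reassembles a | |
// pixel that straddles zlib blocks | |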
#define UPNG__CHUNK_IHDR 0x52444849 | |
#define UPNG__CHUNK_IDAT 0x54414449 | |
#define UPNG__CHUNK_IEND 0x444e4549 | |
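// chunk types are compared as little-endian 32-bit loads, so the ASCII bytes appear | |
// reversed: 0x52444849 is 'I','H','D','R' with 'I' (0x49) in the lowest byte | |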
// parses next chunk from input data, returns 4 bytes of chunk type, sets "chunk_size" in "data" | |
static uint32_t upng__next_chunk(upng__data* data) | |
{ | |
// skip crc32 of previous chunk | |
data->in += 4; | |
data->size -= 4; | |
if (data->size < 4 + 4 + 4) | |
{ | |
// not enough input for chunk size/type/crc | |
return 0; | |
} | |
const uint8_t* in = data->in; | |
data->chunk_size = (in[0] << 24) | (in[1] << 16) | (in[2] << 8) | in[3]; | |
data->in += 4; | |
data->size -= 4; | |
// chunk type | |
data->in += 4; | |
data->size -= 4; | |
if (data->chunk_size > upng__max_chunk_size || data->size < data->chunk_size + 4) | |
{ | |
// bad chunk size, or not enough input provided | |
return 0; | |
} | |
return in[4] | (in[5] << 8) | (in[6] << 16) | (in[7] << 24); | |
} | |
// provides exactly "size" bytes from IDAT chunk payload bytes | |
static const uint8_t* upng__data_expect(upng__data* data, uint32_t size) | |
{ | |
if (size <= data->chunk_size) | |
{ | |
// there are enough bytes in current IDAT chunk | |
return data->in; | |
} | |
// otherwise data is split across multiple IDAT chunks | |
uint32_t temp_size = 0; | |
for (;;) | |
{ | |
// copy available bytes to temp buffer | |
uint32_t avail = size < data->chunk_size ? (uint32_t)size : data->chunk_size; | |
for (size_t i = 0; i < avail; i++) | |
{ | |
data->temp[temp_size++] = *data->in++; | |
} | |
size -= avail; | |
data->size -= avail; | |
data->chunk_size -= avail; | |
if (size == 0) | |
{ | |
break; | |
} | |
// find next non-empty IDAT chunk | |
while (data->chunk_size == 0) | |
{ | |
if (upng__next_chunk(data) != UPNG__CHUNK_IDAT) | |
{ | |
// not enough input | |
return NULL; | |
} | |
} | |
if (temp_size == 0 && size <= data->chunk_size) | |
{ | |
// the requested bytes fully fit into the first chunk (because the current one was empty) | |
return data->in; | |
} | |
} | |
data->temp_size = temp_size; | |
return data->temp; | |
} | |
// consumes "size" bytes from IDAT chunk payload | |
static void upng__data_use(upng__data* data, uint32_t size) | |
{ | |
if (data->temp_size == 0) | |
{ | |
// consume bytes from IDAT chunk payload | |
data->in += size; | |
data->size -= size; | |
data->chunk_size -= size; | |
} | |
else | |
{ | |
// consume bytes from "temp" | |
data->temp_size = 0; | |
} | |
} | |
// provides at least "min_size" bytes from zlib block payload, returns actually usable amount in "size" | |
static const uint8_t* upng__block_expect(upng__data* data, uint32_t min_size, uint32_t* size) | |
{ | |
uint32_t avail_size = data->block_size < data->chunk_size ? data->block_size : data->chunk_size; | |
if (min_size <= avail_size) | |
{ | |
// there are enough bytes | |
*size = avail_size; | |
return data->in; | |
} | |
// otherwise data is split across multiple zlib blocks | |
uint32_t split_size = 0; | |
for (;;) | |
{ | |
// copy available bytes into split_block | |
uint32_t avail = min_size < avail_size ? min_size : avail_size; | |
for (size_t i = 0; i < avail; i++) | |
{ | |
data->split_block[split_size++] = data->in[i]; | |
} | |
upng__data_use(data, avail); | |
data->block_size -= avail; | |
min_size -= avail; | |
avail_size -= avail; | |
if (min_size == 0) | |
{ | |
break; | |
} | |
// in case zlib block spans multiple IDAT chunks, read next IDAT chunk | |
if (data->chunk_size == 0) | |
{ | |
// find next non-empty IDAT chunk | |
while (data->chunk_size == 0) | |
{ | |
if (upng__next_chunk(data) != UPNG__CHUNK_IDAT) | |
{ | |
// not enough input | |
return NULL; | |
} | |
} | |
} | |
// find next non-empty zlib block | |
while (data->block_size == 0) | |
{ | |
if (data->bfinal) | |
{ | |
// after BFINAL=1 there are no more zlib blocks expected | |
return NULL; | |
} | |
const uint8_t* in; | |
if (!(in = upng__data_expect(data, 5))) | |
{ | |
// not enough input | |
return NULL; | |
} | |
uint8_t block_type = in[0]; | |
if ((block_type >> 1) != 0) | |
{ | |
// upng supports only uncompressed zlib blocks (BTYPE=0) | |
return NULL; | |
} | |
uint16_t block_size = in[1] | (in[2] << 8); | |
uint16_t size_check = in[3] | (in[4] << 8); | |
if ((uint16_t)block_size != (uint16_t)~size_check) | |
{ | |
// bad zlib block size (LEN/NLEN) | |
return NULL; | |
} | |
upng__data_use(data, 5); | |
data->bfinal = block_type & 1; | |
data->block_size = block_size; | |
} | |
avail_size = data->block_size < data->chunk_size ? data->block_size : data->chunk_size; | |
if (split_size == 0 && min_size <= avail_size) | |
{ | |
// the requested bytes fully fit into the first block (because the current one was empty) | |
*size = avail_size; | |
return data->in; | |
} | |
} | |
data->split_size = split_size; | |
*size = split_size; | |
return data->split_block; | |
} | |
// consumes "size" bytes from zlib block payload | |
static void upng__block_use(upng__data* data, uint32_t size) | |
{ | |
if (data->split_size == 0) | |
{ | |
// consume bytes from zlib block | |
upng__data_use(data, size); | |
data->block_size -= size; | |
} | |
else | |
{ | |
// consume bytes from "split_block" | |
data->split_size = 0; | |
} | |
} | |
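// Example usage, reading: a minimal sketch (not part of the library); "png_data" and | |
// "png_size" are caller-provided, error handling and freeing are omitted: | |
// | |
//   uint32_t w, h; | |
//   upng_format fmt; | |
//   size_t total = upng_read(NULL, png_data, png_size, &w, &h, &fmt, 0, 0); // header info only | |
//   void* pixels = malloc(total); // total == pitch * height | |
//   size_t done = upng_read(pixels, png_data, png_size, &w, &h, &fmt, 0, 0); // 0 on failure | |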
size_t upng_read(void* dst, const void* src, size_t size, uint32_t* width, uint32_t* height, upng_format* format, size_t pitch, uint32_t flags) | |
{ | |
const uint8_t* in = (const uint8_t*)src; | |
// png signature | |
{ | |
if (size < sizeof(upng__sig) - 1) | |
{ | |
// not enough input bytes for png signature | |
return 0; | |
} | |
for (size_t i = 0; i < sizeof(upng__sig) - 1; i++) | |
{ | |
if (in[i] != (uint8_t)upng__sig[i]) | |
{ | |
// bad png signature | |
return 0; | |
} | |
} | |
in += sizeof(upng__sig) - 1; | |
size -= sizeof(upng__sig) - 1; | |
} | |
uint32_t w, h; | |
upng_format fmt; | |
upng__data data = { 0 }; | |
// back up by 4 bytes, because upng__next_chunk will skip crc32 | |
data.in = in - 4; | |
data.size = size + 4; | |
// IHDR chunk | |
{ | |
if (upng__next_chunk(&data) != UPNG__CHUNK_IHDR) | |
{ | |
// first chunk must be IHDR | |
return 0; | |
} | |
if (data.chunk_size != upng__ihdr_size) | |
{ | |
// bad IHDR size | |
return 0; | |
} | |
const uint8_t* in = data.in; | |
w = (in[0] << 24) | (in[1] << 16) | (in[2] << 8) | in[3]; | |
h = (in[4] << 24) | (in[5] << 16) | (in[6] << 8) | in[7]; | |
uint8_t bits = in[8]; | |
uint8_t type = in[9]; | |
uint8_t compression = in[10]; | |
uint8_t filter = in[11]; | |
uint8_t interlace = in[12]; | |
data.in += upng__ihdr_size; | |
data.size -= upng__ihdr_size; | |
if (w == 0 || h == 0 // invalid width or height values | |
|| (size_t)w * h < (size_t)w // width * height overflows | |
|| (size_t)w * h >(size_t)1 << 48) // width * height takes too much memory | |
{ | |
return 0; | |
} | |
int bgr = !!(flags & UPNG_READ_SWAP_TO_BGR); | |
if (bits == 8) | |
{ | |
switch (type) | |
{ | |
case 0: fmt = UPNG_FORMAT_G8; break; | |
case 2: fmt = bgr ? UPNG_FORMAT_BGR8 : UPNG_FORMAT_RGB8; break; | |
case 4: fmt = UPNG_FORMAT_GA8; break; | |
case 6: fmt = bgr ? UPNG_FORMAT_BGRA8 : UPNG_FORMAT_RGBA8; break; | |
default: return 0; // unsupported 8-bit format | |
} | |
} | |
else if (bits == 16) | |
{ | |
switch (type) | |
{ | |
case 0: fmt = UPNG_FORMAT_G16; break; | |
case 2: fmt = bgr ? UPNG_FORMAT_BGR16 : UPNG_FORMAT_RGB16; break; | |
case 4: fmt = UPNG_FORMAT_GA16; break; | |
case 6: fmt = bgr ? UPNG_FORMAT_BGRA16 : UPNG_FORMAT_RGBA16; break; | |
default: return 0; // unsupported 16-bit format | |
} | |
} | |
else | |
{ | |
// unsupported bit count | |
return 0; | |
} | |
if (compression != 0) | |
{ | |
// unsupported compression method | |
return 0; | |
} | |
if (filter != 0) | |
{ | |
// unsupported filter method | |
return 0; | |
} | |
if (interlace != 0) | |
{ | |
// unsupported interlace method | |
return 0; | |
} | |
*width = w; | |
*height = h; | |
*format = fmt; | |
} | |
const uint32_t bpp = upng__bpp[fmt]; | |
if (pitch == 0) | |
{ | |
if ((size_t)w * bpp < (size_t)bpp) | |
{ | |
// width too large, overflows size_t | |
return 0; | |
} | |
pitch = (size_t)w * bpp; | |
} | |
if (h * pitch < pitch) | |
{ | |
// pitch too large, overflows size_t | |
return 0; | |
} | |
if (dst == NULL) | |
{ | |
// done! only IHDR info requested | |
return h * pitch; | |
} | |
// skip chunks until first IDAT chunk | |
for (;;) | |
{ | |
uint32_t type = upng__next_chunk(&data); | |
if (type == UPNG__CHUNK_IDAT) | |
{ | |
break; | |
} | |
else if ((char)type < 'a' || (char)type > 'z') | |
{ | |
// only "non-critical" chunks allowed | |
return 0; | |
} | |
// ignore optional chunk payload | |
data.in += data.chunk_size; | |
data.size -= data.chunk_size; | |
} | |
// IDAT chunk payload starts with 2 bytes for zlib format | |
{ | |
if (!(in = upng__data_expect(&data, 2))) | |
{ | |
// not enough input | |
return 0; | |
} | |
uint32_t cmf = in[0]; // CM & CINFO | |
uint32_t flg = in[1]; // FCHECK, FDICT, FLEVEL | |
if ((cmf & 0xf) != 0x8) | |
{ | |
// CM must be 8 = deflate compression method | |
return 0; | |
} | |
if (flg & (1 << 5)) | |
{ | |
// FDICT must be 0 = no dictionary | |
return 0; | |
} | |
if ((cmf * 256 + flg) % 31 != 0) | |
{ | |
// bad FCHECK value | |
return 0; | |
} | |
upng__data_use(&data, 2); | |
} | |
for (size_t y = 0; y < h; y++) | |
{ | |
uint32_t in_avail; | |
if (!(in = upng__block_expect(&data, 1, &in_avail))) | |
{ | |
// not enough input for row filter byte | |
return 0; | |
} | |
uint8_t row_filter = in[0]; | |
if (row_filter != UPNG_FILTER_NONE && row_filter != UPNG_FILTER_UP) | |
{ | |
// upng supports only NONE and UP filters | |
return 0; | |
} | |
upng__block_use(&data, 1); | |
if (row_filter == UPNG_FILTER_UP && y == 0) | |
{ | |
// if first row uses UP filter, force it to use NONE | |
row_filter = UPNG_FILTER_NONE; | |
} | |
uint32_t x = 0; | |
while (x < w) | |
{ | |
if (!(in = upng__block_expect(&data, bpp, &in_avail))) | |
{ | |
// not enough input for at least one more pixel | |
return 0; | |
} | |
uint32_t pixel_count = in_avail / bpp; | |
pixel_count = w - x < pixel_count ? w - x : pixel_count; | |
uint32_t pixel_size = pixel_count * bpp; | |
uint8_t* pix_out = (uint8_t*)dst + y * pitch + x * bpp; | |
upng__unrow(pix_out, in, pitch, pixel_size, fmt, (upng_filter)row_filter); | |
x += pixel_count; | |
upng__block_use(&data, pixel_size); | |
} | |
} | |
if (data.block_size != 0) | |
{ | |
// no more bytes expected in zlib payload | |
return 0; | |
} | |
// expect empty zlib blocks until BFINAL=1 | |
while (data.bfinal == 0) | |
{ | |
if (!(in = upng__data_expect(&data, 5))) | |
{ | |
// not enough input | |
return 0; | |
} | |
uint8_t block_type = in[0]; | |
if ((block_type >> 1) != 0) | |
{ | |
// upng supports only uncompressed zlib blocks (BTYPE=0) | |
return 0; | |
} | |
uint16_t block_size = in[1] | (in[2] << 8); | |
uint16_t size_check = in[3] | (in[4] << 8); | |
if (block_size != 0 || size_check != 0xffff) | |
{ | |
// expected 0-sized zlib block | |
return 0; | |
} | |
upng__data_use(&data, 5); | |
data.bfinal = block_type & 1; | |
} | |
// skip adler32 checksum | |
if (!(in = upng__data_expect(&data, 4))) | |
{ | |
return 0; | |
} | |
upng__data_use(&data, 4); | |
if (data.chunk_size != 0) | |
{ | |
// no more bytes expected in IDAT chunks | |
return 0; | |
} | |
// skip until IEND chunk | |
for (;;) | |
{ | |
uint32_t type = upng__next_chunk(&data); | |
if (type == UPNG__CHUNK_IEND) | |
{ | |
break; | |
} | |
else if ((char)type < 'a' || (char)type > 'z') | |
{ | |
// only "non-critical" chunks allowed | |
return 0; | |
} | |
// ignore optional chunk payload | |
data.in += data.chunk_size; | |
data.size -= data.chunk_size; | |
} | |
if (data.chunk_size != 0) | |
{ | |
// IEND chunk size must be 0 | |
return 0; | |
} | |
// ignore crc32 of IEND chunk | |
if (data.size != 4) | |
{ | |
// unexpected length of input | |
return 0; | |
} | |
// OK! | |
return h * pitch; | |
} |