#pragma once

// uncompressed png writer & reader
// supports only 8-bit and 16-bit formats

// Performance comparison for 8192x8192 BGRA8 image (256MB)
// Compiled with "clang -O2", AVX2 requires extra "-mavx2" or "/arch:AVX2" argument
//
// For libpng (compressed) default libpng/zlib compression settings are used
// For libpng (uncompressed) the following two functions are used:
//   png_set_compression_level() with Z_NO_COMPRESSION
//   png_set_filter() with PNG_FILTER_NONE
//
// Ryzen 5950x
//   upng (AVX2)           =   22.9 msec (11157.3 MB/s), read =   20.5 msec (12499.4 MB/s)
//   upng                  =   27.7 msec ( 9254.6 MB/s), read =   20.8 msec (12296.8 MB/s)
//   libpng (uncompressed) =  169.9 msec ( 1506.9 MB/s), read =  167.5 msec ( 1528.6 MB/s)
//   libpng (compressed)   = 2148.1 msec (  119.2 MB/s), read =  503.5 msec (  508.4 MB/s)
//
// Raspberry Pi4 (-march=armv8-a+crc)
//   upng                  =  182.9 msec (1399.7 MB/s), read =  110.8 msec (2310.8 MB/s)
//   libpng (uncompressed) = 1192.7 msec ( 214.6 MB/s), read = 1211.8 msec ( 211.3 MB/s)
//   libpng (compressed)   = 9396.8 msec (  27.2 MB/s), read = 1874.6 msec ( 136.6 MB/s)
//
// Apple M1 (-march=armv8-a+crc+crypto)
//   upng                  =   22.2 msec (11523.7 MB/s), read =    8.9 msec (28622.5 MB/s)
//   libpng (uncompressed) =   93.3 msec ( 2743.3 MB/s), read =   66.6 msec ( 3841.8 MB/s)
//   libpng (compressed)   = 2038.6 msec (  125.6 MB/s), read =   90.4 msec ( 2832.5 MB/s)

#include <stddef.h>
#include <stdint.h>

typedef enum {
    UPNG_FORMAT_G8,
    UPNG_FORMAT_GA8,
    UPNG_FORMAT_RGB8,
    UPNG_FORMAT_BGR8,
    UPNG_FORMAT_RGBA8,
    UPNG_FORMAT_BGRA8,
    UPNG_FORMAT_G16,
    UPNG_FORMAT_GA16,
    UPNG_FORMAT_RGB16,
    UPNG_FORMAT_BGR16,
    UPNG_FORMAT_RGBA16,
    UPNG_FORMAT_BGRA16,
} upng_format;

typedef enum {
    UPNG_FILTER_NONE = 0,
    UPNG_FILTER_UP = 2,
} upng_filter;

// if `dst` is NULL then the function quickly returns the size needed for `dst` (`src` won't be used)
// if `pitch` is 0, then pixels in `src` are tightly packed without any padding bytes between rows
// returns 0 for unsupported parameter values
static size_t upng_write(void* dst, const void* src, uint32_t width, uint32_t height, size_t pitch, upng_format format, upng_filter filter);

// output BGR/BGRA format instead of RGB/RGBA
#define UPNG_READ_SWAP_TO_BGR 1

// if `dst` is NULL then the function quickly returns `width` / `height` / `format` values from the png header
// if `pitch` is 0, then pixels in `dst` will be tightly packed without any padding bytes between rows
// returns total size of image: `pitch` multiplied by `height`
// returns 0 if png file cannot be successfully parsed or is unsupported
// function does NOT verify CRC32 or ADLER32 checksums
static size_t upng_read(void* dst, const void* src, size_t size, uint32_t* width, uint32_t* height, upng_format* format, size_t pitch, uint32_t flags);
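
// Example usage (a minimal sketch; `width`, `height` and `pixels` are assumed to be
// provided by the caller, and error checking is omitted):
#if 0
    // writing: query the output size first, then encode into a caller-provided buffer
    size_t png_max = upng_write(NULL, NULL, width, height, 0, UPNG_FORMAT_BGRA8, UPNG_FILTER_NONE);
    void* png = malloc(png_max);
    size_t png_size = upng_write(png, pixels, width, height, 0, UPNG_FORMAT_BGRA8, UPNG_FILTER_NONE);

    // reading: query dimensions/format first, then decode into a tightly packed buffer
    uint32_t w, h;
    upng_format fmt;
    size_t pixels_size = upng_read(NULL, png, png_size, &w, &h, &fmt, 0, 0);
    void* out = malloc(pixels_size);
    upng_read(out, png, png_size, &w, &h, &fmt, 0, 0);

    free(out);
    free(png);
#endif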
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// optional defines:
// UPNG_DISABLE_AVX2   - do not use AVX2 codepath, even if AVX2 is allowed by compiler
// UPNG_USE_ARM64_PMUL - prefer the ARM64 PMUL instruction over CRC32 on non-Apple targets;
//                       this may be slower than using the CRC32 instruction, depending on the CPU
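//
// for example, to force the scalar/SSE codepaths on x64 even when building with -mavx2
// (assuming this file is saved as "upng.h"):
//
//   #define UPNG_DISABLE_AVX2
//   #include "upng.h"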
#if defined(_M_AMD64) || defined(__x86_64__)
# define UPNG__ARCH_X64
#elif defined(_M_ARM64) || defined(__aarch64__)
# define UPNG__ARCH_ARM64
#endif

#if defined(UPNG__ARCH_X64)
# if defined(__clang__) || defined(__GNUC__)
#  include <cpuid.h>
#  define UPNG__CPUID(num, regs) __cpuid(num, regs[0], regs[1], regs[2], regs[3])
#  define UPNG__CPUID2(num, sub, regs) __cpuid_count(num, sub, regs[0], regs[1], regs[2], regs[3])
#  define UPNG__TARGET(str) __attribute__((target(str)))
# else
#  include <intrin.h>
#  define UPNG__CPUID(num, regs) __cpuid(regs, num)
#  define UPNG__CPUID2(num, sub, regs) __cpuidex(regs, num, sub)
#  define UPNG__TARGET(str)
# endif
# if defined(__AVX2__) && !defined(UPNG_DISABLE_AVX2)
#  define UPNG__ARCH_X64_AVX2
#  include <immintrin.h>
#  if !defined(__clang__) && defined(_MSC_VER) && (_MSC_VER > 1930 && _MSC_VER < 1936)
    // broken MSVC versions that do not generate the VEX-encoded VPCLMULQDQ instruction
    // see https://developercommunity.visualstudio.com/t/_mm_clmulepi64_si128-intrinsic-no-longer/10277103
#   pragma message("WARNING: this MSVC compiler version produces very bad performance with AVX2 codegen!")
#   undef UPNG__ARCH_X64_AVX2
#  elif !defined(__clang__) && defined(_MSC_VER) && (_MSC_VER == 1938)
    // broken MSVC version that generates AVX512 instructions in AVX2 code
    // see https://developercommunity.visualstudio.com/t/Invalid-AVX512-instructions-generated-wh/10521872
#   pragma message("WARNING: this MSVC compiler version produces invalid instructions with AVX2 codegen!")
#   undef UPNG__ARCH_X64_AVX2
#  endif
# endif
# include <wmmintrin.h> // CLMUL  // _mm_clmulepi64_si128
# include <smmintrin.h> // SSE4.1 // _mm_extract_epi32
# include <tmmintrin.h> // SSSE3  // _mm_maddubs_epi16, _mm_hadd_epi32, _mm_shuffle_epi8
# include <emmintrin.h> // SSE2
#elif defined(UPNG__ARCH_ARM64)
# include <arm_neon.h>
# if __ARM_FEATURE_CRC32 // use -march=armv8-a+crc when possible
#  define UPNG__ARM64_CRC32 // __crc32d, __crc32b
#  include <arm_acle.h>
# endif
# if __ARM_FEATURE_CRYPTO // use -march=armv8-a+crypto when possible
#  if defined(__APPLE__) || defined(UPNG_USE_ARM64_PMUL) || !defined(UPNG__ARM64_CRC32)
#   define UPNG__ARM64_CRYPTO // vmull_p64, vmull_high_p64
#  endif
# endif
#endif

#if defined(_MSC_VER) && !defined(__clang__)
# include <intrin.h>
# define UPNG__ALIGN(n, var) __declspec(align(n)) var
# define UPNG__MSVC_BARRIER() _ReadWriteBarrier()
# define UPNG__ASSUME_ALIGNED(ptr, align) (ptr)
#else
# define UPNG__ALIGN(n, var) var __attribute__((aligned(n)))
# define UPNG__MSVC_BARRIER() // not needed for non-MSVC compilers
# define UPNG__ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned(ptr, align)
#endif

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#define UPNG__ADLER32_INIT 1U
#define UPNG__ADLER32_MOD 65521

// max number of bytes that can be processed without overflowing "b" as uint32_t
// max "b" value = 255*n*(n+1)/2 + (n+1)*(65521-1)
#define UPNG__ADLER32_CHUNK_SIZE 5552

// max number of 16-byte blocks to use for SIMD
#define UPNG__ADLER32_BLOCKS1 (UPNG__ADLER32_CHUNK_SIZE / 16)
#define UPNG__ADLER32_BLOCKS3 (UPNG__ADLER32_CHUNK_SIZE / 48)
#define UPNG__ADLER32_BLOCKS4 (UPNG__ADLER32_CHUNK_SIZE / 64)
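
// a quick way to sanity-check the chunk size above (a standalone sketch, not compiled
// as part of this header): 5552 is the largest n, stepping by 16, for which the
// worst-case "b" (a starting at 65520, every byte 255) still fits in uint32_t
#if 0
#include <assert.h>
static void upng__check_chunk_size(void)
{
    for (uint64_t n = 16; ; n += 16)
    {
        uint64_t b_max = 255 * n * (n + 1) / 2 + (n + 1) * (65521 - 1);
        if (b_max > 0xffffffffULL)
        {
            assert(n - 16 == UPNG__ADLER32_CHUNK_SIZE);
            break;
        }
    }
}
#endif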
static uint32_t upng__adler32(uint32_t adler, const void* ptr, size_t size)
{
    const uint8_t* bytes = (const uint8_t*)ptr;

    uint32_t a = adler & 0xffff;
    uint32_t b = (adler >> 16);

    // no SIMD here, this function is used either for small sizes only or when SIMD is not available
    while (size >= UPNG__ADLER32_CHUNK_SIZE)
    {
        for (size_t k = 0; k < UPNG__ADLER32_CHUNK_SIZE; k++)
        {
            a += *bytes++;
            b += a;
        }
        size -= UPNG__ADLER32_CHUNK_SIZE;
        a %= UPNG__ADLER32_MOD;
        b %= UPNG__ADLER32_MOD;
    }

    while (size-- != 0)
    {
        a += *bytes++;
        b += a;
    }
    a %= UPNG__ADLER32_MOD;
    b %= UPNG__ADLER32_MOD;

    return a | (b << 16);
}
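
// known-answer check (a standalone sketch, not compiled as part of this header;
// 0x11e60398 is the well-known Adler-32 of the string "Wikipedia"):
#if 0
#include <assert.h>
#include <string.h>
static void upng__adler32_selftest(void)
{
    const char* s = "Wikipedia";
    assert(upng__adler32(UPNG__ADLER32_INIT, s, strlen(s)) == 0x11e60398);
}
#endif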
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#define UPNG__CRC32_INIT 0U

#if !defined(UPNG__CRC32_TABLE_COUNT)
# if defined(UPNG__ARM64_CRC32)
#  define UPNG__CRC32_TABLE_COUNT 0  // no need for CRC32 tables if the ACLE crc32 instruction can be used
# elif defined(UPNG__ARCH_X64)
#  define UPNG__CRC32_TABLE_COUNT 16 // for x64 use a 16KB table (half of L1 cache)
# else
#  define UPNG__CRC32_TABLE_COUNT 8  // otherwise be safe and use only 8KB, alternatively set to 4 for a 4KB table
# endif
#endif
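
// since the value is only defaulted when not already defined, the table count can be
// overridden before including this file, e.g. to save memory on small targets:
//
//   #define UPNG__CRC32_TABLE_COUNT 4 // 4KB table, slower slicing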
#if UPNG__CRC32_TABLE_COUNT != 0
static uint32_t upng__crc32_table[UPNG__CRC32_TABLE_COUNT][256];
#endif

static void upng__crc32_init(void)
{
#if UPNG__CRC32_TABLE_COUNT != 0
    static int init = 0;
    if (!init)
    {
        const uint32_t CRC32_POLY = 0xedb88320;

        for (size_t i = 0; i < 256; i++)
        {
            uint32_t crc = (uint32_t)i;
            for (size_t j = 0; j < 8; j++)
            {
                crc = (crc >> 1) ^ (crc & 1 ? CRC32_POLY : 0);
            }
            upng__crc32_table[0][i] = crc;
        }

        for (size_t i = 1; i < UPNG__CRC32_TABLE_COUNT; i++)
        {
            for (size_t j = 0; j < 256; j++)
            {
                upng__crc32_table[i][j] = (upng__crc32_table[i - 1][j] >> 8) ^ upng__crc32_table[0][upng__crc32_table[i - 1][j] & 0xff];
            }
        }

        init = 1;
    }
#endif
}

static uint32_t upng__crc32(uint32_t crc, const void* ptr, size_t size)
{
    const uint8_t* bytes = (const uint8_t*)ptr;

    crc = ~crc;

    // no SIMD here, this function is used either for small sizes only or when SIMD is not available
#if defined(UPNG__ARM64_CRC32)
    while (size-- != 0)
    {
        crc = __crc32b(crc, *bytes++);
    }
#else
    while ((((uintptr_t)bytes % 4) != 0) && (size != 0))
    {
        crc = (crc >> 8) ^ upng__crc32_table[0][(crc & 0xff) ^ *bytes++];
        size -= 1;
    }

    // now bytes pointer is 4-byte aligned
    const uint32_t* bytes4 = (const uint32_t*)UPNG__ASSUME_ALIGNED(bytes, 4);

#if UPNG__CRC32_TABLE_COUNT == 16
    while (size >= 16)
    {
        uint32_t b0 = *bytes4++ ^ crc;
        uint32_t b1 = *bytes4++;
        uint32_t b2 = *bytes4++;
        uint32_t b3 = *bytes4++;
        size -= 16;
        // these barriers should not affect anything, but they make MSVC (2022) generate ~25% faster code
        UPNG__MSVC_BARRIER();
        crc = upng__crc32_table[0][(b3 >> 24) & 0xff] ^ upng__crc32_table[1][(b3 >> 16) & 0xff] ^ upng__crc32_table[2][(b3 >> 8) & 0xff] ^ upng__crc32_table[3][b3 & 0xff];
        UPNG__MSVC_BARRIER();
        crc ^= upng__crc32_table[4][(b2 >> 24) & 0xff] ^ upng__crc32_table[5][(b2 >> 16) & 0xff] ^ upng__crc32_table[6][(b2 >> 8) & 0xff] ^ upng__crc32_table[7][b2 & 0xff];
        UPNG__MSVC_BARRIER();
        crc ^= upng__crc32_table[8][(b1 >> 24) & 0xff] ^ upng__crc32_table[9][(b1 >> 16) & 0xff] ^ upng__crc32_table[10][(b1 >> 8) & 0xff] ^ upng__crc32_table[11][b1 & 0xff];
        UPNG__MSVC_BARRIER();
        crc ^= upng__crc32_table[12][(b0 >> 24) & 0xff] ^ upng__crc32_table[13][(b0 >> 16) & 0xff] ^ upng__crc32_table[14][(b0 >> 8) & 0xff] ^ upng__crc32_table[15][b0 & 0xff];
    }
#elif UPNG__CRC32_TABLE_COUNT == 8
    while (size >= 8)
    {
        uint32_t b0 = *bytes4++ ^ crc;
        uint32_t b1 = *bytes4++;
        size -= 8;
        size_t i0 = (b1 >> 24) & 0xff;
        size_t i1 = (b1 >> 16) & 0xff;
        size_t i2 = (b1 >> 8) & 0xff;
        size_t i3 = b1 & 0xff;
        size_t i4 = (b0 >> 24) & 0xff;
        size_t i5 = (b0 >> 16) & 0xff;
        size_t i6 = (b0 >> 8) & 0xff;
        size_t i7 = b0 & 0xff;
        // similar situation to the 16-table case: this makes MSVC (2022) generate ~25% faster code
        UPNG__MSVC_BARRIER();
        crc = upng__crc32_table[0][i0] ^ upng__crc32_table[4][i4];
        UPNG__MSVC_BARRIER();
        crc ^= upng__crc32_table[1][i1] ^ upng__crc32_table[5][i5];
        UPNG__MSVC_BARRIER();
        crc ^= upng__crc32_table[2][i2] ^ upng__crc32_table[6][i6];
        UPNG__MSVC_BARRIER();
        crc ^= upng__crc32_table[3][i3] ^ upng__crc32_table[7][i7];
    }
#elif UPNG__CRC32_TABLE_COUNT == 4
    while (size >= 4)
    {
        uint32_t b0 = *bytes4++ ^ crc;
        size -= 4;
        crc = upng__crc32_table[0][(b0 >> 24) & 0xff] ^ upng__crc32_table[1][(b0 >> 16) & 0xff] ^ upng__crc32_table[2][(b0 >> 8) & 0xff] ^ upng__crc32_table[3][b0 & 0xff];
    }
#endif

    bytes = (const uint8_t*)bytes4;
    while (size-- != 0)
    {
        crc = (crc >> 8) ^ upng__crc32_table[0][(crc & 0xff) ^ *bytes++];
    }
#endif

    return ~crc;
}
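
// known-answer check (a standalone sketch, not compiled as part of this header;
// 0xcbf43926 is the standard CRC-32 check value for the string "123456789"):
#if 0
#include <assert.h>
#include <string.h>
static void upng__crc32_selftest(void)
{
    const char* s = "123456789";
    upng__crc32_init(); // tables must be initialized before first use
    assert(upng__crc32(UPNG__CRC32_INIT, s, strlen(s)) == 0xcbf43926);
}
#endif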
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

typedef struct {
    uint32_t crc;   // crc32 for whole chunk + 4 byte type
    uint32_t adler; // adler32 for zlib payload
} upng__idat;

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#if defined(UPNG__ARCH_X64)

#define UPNG__CPUID_SSE41 (1<<1) // SSSE3+SSE4.1
#define UPNG__CPUID_CLMUL (1<<2) // SSSE3+SSE4.1+CLMUL

static int upng__cpuid(void)
{
    static int cpuid;
    if (!cpuid)
    {
        int info[4];
        UPNG__CPUID(1, info);

        int detected = (1 << 0);
        if (!!(info[2] & (1 << 9))) // SSSE3 bit in CPUID.1:ECX
        {
            if (!!(info[2] & (1 << 19))) // SSE4.1 bit in CPUID.1:ECX
            {
                detected |= UPNG__CPUID_SSE41;
                if (!!(info[2] & (1 << 1))) // CLMUL bit in CPUID.1:ECX
                {
                    detected |= UPNG__CPUID_CLMUL;
                }
            }
        }
        cpuid = detected;
    }
    return cpuid;
}
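
// a sketch of how these flags are meant to drive dispatch later in the file
// (assumed, based on the flag definitions above; not part of the public API):
#if 0
    int features = upng__cpuid();
    if (features & UPNG__CPUID_CLMUL)      { /* SSSE3+SSE4.1+PCLMULQDQ codepath */ }
    else if (features & UPNG__CPUID_SSE41) { /* SSSE3+SSE4.1 table-based codepath */ }
    else                                   { /* scalar fallback */ }
#endif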
static size_t UPNG__TARGET("ssse3,sse4.1") | |
upng__row1_sse4(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64) | |
{ | |
uint8_t* out = dst; | |
const __m128i shuffle = _mm_set_epi64x(shuffle64 + 0x0808080808080808, shuffle64); | |
uint32_t a = idat->adler & 0xffff; | |
uint32_t b = idat->adler >> 16; | |
uint32_t crc = ~idat->crc; | |
// adler32 | |
const __m128i cmul = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); | |
const __m128i ones = _mm_set1_epi16(1); | |
const __m128i zero = _mm_setzero_si128(); | |
// on input | |
// a0 = a | |
// a = a0 + x[0] | |
// b += a0 + x[0] | |
// ... | |
// a = a0 + x[0] + x[1] | |
// b += 2*a0 + 2*x[0] + 1*x[1] | |
// ... | |
// a = a0 + x[0] + ... + x[N-1] | |
// b += N*a0 + N*x[0] + (N-1)*x[1] + ... + 2*x[N-2] + 1*x[N-1] | |
// processing every 16 bytes in an iteration (5552 is multiple of 16) | |
// va = a0 + (x[0]+...+x[15]) + (x[16]+..+x[31]) + ... | |
// vb = b + (16*x[0]+...+1*x[15]) + (16*x[16]+...+1*x[31]) + ... + (16*X[N-16]+...1*x[N-1]) | |
// vs = n*a0 + (n-1)*(x[0]+...+x[15]) + (n-2)*(x[16]+...+x[31]) + ... + 1*(x[N-32]+...+x[N-17]) + 0*(x[N-16]+...+x[N-1]) | |
// where n = N/16 | |
// vs*16 | |
// N*a0 + (N-16)*x[0]+...+(N-16)*x[15] + (N-32)*x[16]+...+(N-16)*x[31] + ... + 16*x[N-32]+...+16*x[N-17] | |
// vb+vs*16 | |
// N*a0 + N*x[0] + (N-1)*x[1] + ... + 16*x[N-16] + 15*x[N-15] + ... + 1*x[N-1] | |
// for output | |
// a = va | |
// b = vb+vs*16 | |
while (size >= 16) | |
{ | |
__m128i vs = zero; | |
__m128i va = _mm_cvtsi32_si128(a); | |
__m128i vb = _mm_cvtsi32_si128(b); | |
// process as many 16-byte blocks as possible | |
size_t block_count = size / 16; | |
block_count = block_count < UPNG__ADLER32_BLOCKS1 ? block_count : UPNG__ADLER32_BLOCKS1; | |
for (size_t i = 0; i < block_count; i++) | |
{ | |
// pixel filtering | |
__m128i vlast = _mm_loadu_si128((const __m128i*)last); | |
__m128i vsrc = _mm_loadu_si128((const __m128i*)src); | |
__m128i vdst = _mm_shuffle_epi8(_mm_sub_epi8(vsrc, vlast), shuffle); | |
_mm_storeu_si128((__m128i*)dst, vdst); | |
last += inc; | |
src += 16; | |
dst += 16; | |
size -= 16; | |
// adler32 update | |
vs = _mm_add_epi32(vs, va); | |
va = _mm_add_epi32(va, _mm_sad_epu8(vdst, zero)); | |
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst, cmul), ones)); | |
// crc32 update | |
uint32_t b0 = _mm_extract_epi32(vdst, 0) ^ crc; | |
uint32_t b1 = _mm_extract_epi32(vdst, 1); | |
uint32_t b2 = _mm_extract_epi32(vdst, 2); | |
uint32_t b3 = _mm_extract_epi32(vdst, 3); | |
UPNG__MSVC_BARRIER(); | |
crc = upng__crc32_table[0][(b3 >> 24) & 0xff] ^ upng__crc32_table[1][(b3 >> 16) & 0xff] ^ upng__crc32_table[2][(b3 >> 8) & 0xff] ^ upng__crc32_table[3][b3 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[4][(b2 >> 24) & 0xff] ^ upng__crc32_table[5][(b2 >> 16) & 0xff] ^ upng__crc32_table[6][(b2 >> 8) & 0xff] ^ upng__crc32_table[7][b2 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[8][(b1 >> 24) & 0xff] ^ upng__crc32_table[9][(b1 >> 16) & 0xff] ^ upng__crc32_table[10][(b1 >> 8) & 0xff] ^ upng__crc32_table[11][b1 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[12][(b0 >> 24) & 0xff] ^ upng__crc32_table[13][(b0 >> 16) & 0xff] ^ upng__crc32_table[14][(b0 >> 8) & 0xff] ^ upng__crc32_table[15][b0 & 0xff]; | |
} | |
// vb += vs * 16 | |
vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4)); | |
// a = sum(va) | |
va = _mm_hadd_epi32(va, va); | |
va = _mm_hadd_epi32(va, va); | |
a = _mm_cvtsi128_si32(va); | |
// b = sum(vb) | |
vb = _mm_hadd_epi32(vb, vb); | |
vb = _mm_hadd_epi32(vb, vb); | |
b = _mm_cvtsi128_si32(vb); | |
a %= UPNG__ADLER32_MOD; | |
b %= UPNG__ADLER32_MOD; | |
} | |
idat->adler = a | (b << 16); | |
idat->crc = ~crc; | |
return dst - out; | |
} | |
#if defined(UPNG__ARCH_X64_AVX2)

static size_t UPNG__TARGET("ssse3,sse4.1,avx2,pclmul")
upng__row1_avx2(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64)
{
    if (size < 16)
    {
        return 0;
    }

    uint8_t* out = dst;

    const __m128i shuffle128 = _mm_set_epi64x(shuffle64 + 0x0808080808080808, shuffle64);

    uint32_t a = idat->adler & 0xffff;
    uint32_t b = idat->adler >> 16;
    uint32_t crc = ~idat->crc;

    // crc32
    const __m128i k1k2 = _mm_setr_epi32(0x54442bd4, 1, 0xc6e41596, 1);
    const __m128i k3k4 = _mm_setr_epi32(0x751997d0, 1, 0xccaa009e, 0);
    const __m128i k5k0 = _mm_setr_epi32(0x63cd6124, 1, 0x00000000, 0);
    const __m128i poly = _mm_setr_epi32(0xdb710641, 1, 0xf7011641, 0);
    const __m128i mask32 = _mm_setr_epi32(-1, 0, 0, 0); // low 32 bits

    // "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
    // https://web.archive.org/web/20230315165408/https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf

    // calculates m=(1<<n)%P, which is a 32-bit value
    // returns 33-bit value reflect(m<<32,64) << 1
    //
    // uint64_t crc32_pow2mod(size_t n)
    // {
    //     uint32_t mod = CRC32_POLY;
    //     for (size_t i = 0; i < n - 32; i++)
    //     {
    //         mod = (mod >> 1) ^ (mod & 1 ? CRC32_POLY : 0);
    //     }
    //     // bits are already reflected
    //     return (uint64_t)mod << 1;
    // }

    // calculates d=(1<<64)/P, which is a 33-bit value (65-32=33)
    // returns 33-bit value reflect(d,33)
    //
    // uint64_t crc32_2pow64div(void)
    // {
    //     uint64_t div = 1;
    //     uint32_t mod = CRC32_POLY;
    //     for (size_t i = 0; i < 32; i++)
    //     {
    //         div |= (mod&1ULL) << (i+1);
    //         mod = (mod >> 1) ^ (mod & 1 ? CRC32_POLY : 0);
    //     }
    //     // bits are already reflected
    //     return div;
    // }

    // k1 = crc32_pow2mod(4*128+32)
    // k2 = crc32_pow2mod(4*128-32)
    // k3 = crc32_pow2mod(128+32)
    // k4 = crc32_pow2mod(128-32)
    // k5 = crc32_pow2mod(64)
    // P = ((uint64_t)CRC32_POLY << 1) | 1
    // u = crc32_2pow64div()

    // first iteration does not need to multiply, just leave x0 unchanged: x0*1 => x0
    __m128i crc_mul = _mm_setr_epi32(1, 0, 0, 0);
    __m128i x0 = _mm_cvtsi32_si128(crc);

    if (size >= 64)
    {
        const __m256i shuffle256 = _mm256_broadcastsi128_si256(shuffle128);

        // adler32
        const __m256i cmul = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
        const __m256i ones = _mm256_set1_epi16(1);
        const __m256i zero = _mm256_setzero_si256();

        // crc32
        __m128i x1 = _mm_setzero_si128();
        __m128i x2 = _mm_setzero_si128();
        __m128i x3 = _mm_setzero_si128();

        while (size >= 64)
        {
            __m256i vs = zero;
            __m256i va = _mm256_zextsi128_si256(_mm_cvtsi32_si128(a));
            __m256i vb = _mm256_zextsi128_si256(_mm_cvtsi32_si128(b));

            // process as many 64-byte blocks as possible
            size_t block_count = size / 64;
            block_count = block_count < UPNG__ADLER32_BLOCKS4 ? block_count : UPNG__ADLER32_BLOCKS4;
            for (size_t i = 0; i < block_count; i++)
            {
                // pixel filtering
                __m256i vlast0 = _mm256_loadu_si256((const __m256i*)last + 0);
                __m256i vlast1 = _mm256_loadu_si256((const __m256i*)last + 1);
                __m256i vsrc0 = _mm256_loadu_si256((const __m256i*)src + 0);
                __m256i vsrc1 = _mm256_loadu_si256((const __m256i*)src + 1);
                __m256i vdst0 = _mm256_shuffle_epi8(_mm256_sub_epi8(vsrc0, vlast0), shuffle256);
                __m256i vdst1 = _mm256_shuffle_epi8(_mm256_sub_epi8(vsrc1, vlast1), shuffle256);
                _mm256_storeu_si256((__m256i*)dst + 0, vdst0);
                _mm256_storeu_si256((__m256i*)dst + 1, vdst1);
                last += inc * 4;
                src += 64;
                dst += 64;
                size -= 64;
                // adler32 update
                vs = _mm256_add_epi32(vs, va);
                va = _mm256_add_epi32(va, _mm256_sad_epu8(vdst0, zero));
                vb = _mm256_add_epi32(vb, _mm256_madd_epi16(_mm256_maddubs_epi16(vdst0, cmul), ones));
                vs = _mm256_add_epi32(vs, va);
                va = _mm256_add_epi32(va, _mm256_sad_epu8(vdst1, zero));
                vb = _mm256_add_epi32(vb, _mm256_madd_epi16(_mm256_maddubs_epi16(vdst1, cmul), ones));
                // crc32 update
                x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, crc_mul, 0x00), _mm_clmulepi64_si128(x0, crc_mul, 0x11));
                x0 = _mm_xor_si128(x0, _mm256_castsi256_si128(vdst0));
                x1 = _mm_xor_si128(_mm_clmulepi64_si128(x1, crc_mul, 0x00), _mm_clmulepi64_si128(x1, crc_mul, 0x11));
                x1 = _mm_xor_si128(x1, _mm256_extracti128_si256(vdst0, 1));
                x2 = _mm_xor_si128(_mm_clmulepi64_si128(x2, crc_mul, 0x00), _mm_clmulepi64_si128(x2, crc_mul, 0x11));
                x2 = _mm_xor_si128(x2, _mm256_castsi256_si128(vdst1));
                x3 = _mm_xor_si128(_mm_clmulepi64_si128(x3, crc_mul, 0x00), _mm_clmulepi64_si128(x3, crc_mul, 0x11));
                x3 = _mm_xor_si128(x3, _mm256_extracti128_si256(vdst1, 1));
                crc_mul = k1k2;
            }

            // vb += vs * 32 (vs is accumulated once per 32-byte lane here)
            vb = _mm256_add_epi32(vb, _mm256_slli_epi32(vs, 5));
            // a = sum(va)
            __m128i asum = _mm_add_epi32(_mm256_castsi256_si128(va), _mm256_extracti128_si256(va, 1));
            asum = _mm_hadd_epi32(asum, asum);
            asum = _mm_hadd_epi32(asum, asum);
            a = _mm_cvtsi128_si32(asum);
            // b = sum(vb)
            __m128i bsum = _mm_add_epi32(_mm256_castsi256_si128(vb), _mm256_extracti128_si256(vb, 1));
            bsum = _mm_hadd_epi32(bsum, bsum);
            bsum = _mm_hadd_epi32(bsum, bsum);
            b = _mm_cvtsi128_si32(bsum);

            a %= UPNG__ADLER32_MOD;
            b %= UPNG__ADLER32_MOD;
        }

        // reduce 512-bit to 128-bit
        x0 = _mm_xor_si128(x1, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
        x0 = _mm_xor_si128(x2, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
        x0 = _mm_xor_si128(x3, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
        crc_mul = k3k4;
    }

    if (size >= 16)
    {
        const __m128i cmul = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
        const __m128i ones = _mm_set1_epi16(1);
        const __m128i zero = _mm_setzero_si128();

        __m128i vs = zero;
        __m128i va = _mm_cvtsi32_si128(a);
        __m128i vb = _mm_cvtsi32_si128(b);

        // only 1 to 3 iterations
        while (size >= 16)
        {
            __m128i vlast = _mm_loadu_si128((const __m128i*)last);
            __m128i vsrc = _mm_loadu_si128((const __m128i*)src);
            __m128i vdst = _mm_shuffle_epi8(_mm_sub_epi8(vsrc, vlast), shuffle128);
            _mm_storeu_si128((__m128i*)dst, vdst);
            last += inc;
            src += 16;
            dst += 16;
            size -= 16;
            // adler32 update
            vs = _mm_add_epi32(vs, va);
            va = _mm_add_epi32(va, _mm_sad_epu8(vdst, zero));
            vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst, cmul), ones));
            // crc32 update
            x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, crc_mul, 0x00), _mm_clmulepi64_si128(x0, crc_mul, 0x11));
            x0 = _mm_xor_si128(x0, vdst);
            crc_mul = k3k4;
        }

        // vb += vs * 16
        vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4));
        // a = sum(va)
        va = _mm_hadd_epi32(va, va);
        va = _mm_hadd_epi32(va, va);
        a = _mm_cvtsi128_si32(va);
        // b = sum(vb)
        vb = _mm_hadd_epi32(vb, vb);
        vb = _mm_hadd_epi32(vb, vb);
        b = _mm_cvtsi128_si32(vb);

        a %= UPNG__ADLER32_MOD;
        b %= UPNG__ADLER32_MOD;
    }

    idat->adler = a | (b << 16);

    // reduce 128-bit to 96-bit
    x0 = _mm_xor_si128(_mm_srli_si128(x0, 8), _mm_clmulepi64_si128(x0, k3k4, 0x10));
    // reduce 96-bit to 64-bit
    x0 = _mm_xor_si128(_mm_srli_si128(x0, 4), _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), k5k0, 0x00));
    // reduce 64-bit to 32-bit
    __m128i x1;
    x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), poly, 0x10);
    x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32), poly, 0x00);
    crc = _mm_extract_epi32(_mm_xor_si128(x0, x1), 1);

    idat->crc = ~crc;
    return dst - out;
}
#else // UPNG__ARCH_X64_AVX2

static size_t UPNG__TARGET("ssse3,sse4.1,pclmul")
upng__row1_clmul(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64)
{
    if (size < 16)
    {
        return 0;
    }

    uint8_t* out = dst;

    const __m128i shuffle = _mm_set_epi64x(shuffle64 + 0x0808080808080808, shuffle64);

    uint32_t a = idat->adler & 0xffff;
    uint32_t b = idat->adler >> 16;
    uint32_t crc = ~idat->crc;

    // adler32
    const __m128i cmul = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    const __m128i ones = _mm_set1_epi16(1);
    const __m128i zero = _mm_setzero_si128();

    // crc32
    const __m128i k1k2 = _mm_setr_epi32(0x54442bd4, 1, 0xc6e41596, 1);
    const __m128i k3k4 = _mm_setr_epi32(0x751997d0, 1, 0xccaa009e, 0);
    const __m128i k5k0 = _mm_setr_epi32(0x63cd6124, 1, 0x00000000, 0);
    const __m128i poly = _mm_setr_epi32(0xdb710641, 1, 0xf7011641, 0);
    const __m128i mask32 = _mm_setr_epi32(-1, 0, 0, 0);

    // first iteration does not need to multiply, just leave x0 unchanged: x0*1 => x0
    __m128i crc_mul = _mm_setr_epi32(1, 0, 0, 0);
    __m128i x0 = _mm_cvtsi32_si128(crc);

    if (size >= 64)
    {
        __m128i x1 = zero;
        __m128i x2 = zero;
        __m128i x3 = zero;

        while (size >= 64)
        {
            __m128i vs = zero;
            __m128i va = _mm_cvtsi32_si128(a);
            __m128i vb = _mm_cvtsi32_si128(b);

            // process as many 64-byte blocks as possible
            size_t block_count = size / 64;
            block_count = block_count < UPNG__ADLER32_BLOCKS4 ? block_count : UPNG__ADLER32_BLOCKS4;
            for (size_t i = 0; i < block_count; i++)
            {
                // pixel filtering
                __m128i vlast0 = _mm_loadu_si128((const __m128i*)last + 0);
                __m128i vlast1 = _mm_loadu_si128((const __m128i*)last + 1);
                __m128i vlast2 = _mm_loadu_si128((const __m128i*)last + 2);
                __m128i vlast3 = _mm_loadu_si128((const __m128i*)last + 3);
                __m128i vsrc0 = _mm_loadu_si128((const __m128i*)src + 0);
                __m128i vsrc1 = _mm_loadu_si128((const __m128i*)src + 1);
                __m128i vsrc2 = _mm_loadu_si128((const __m128i*)src + 2);
                __m128i vsrc3 = _mm_loadu_si128((const __m128i*)src + 3);
                __m128i vdst0 = _mm_shuffle_epi8(_mm_sub_epi8(vsrc0, vlast0), shuffle);
                __m128i vdst1 = _mm_shuffle_epi8(_mm_sub_epi8(vsrc1, vlast1), shuffle);
                __m128i vdst2 = _mm_shuffle_epi8(_mm_sub_epi8(vsrc2, vlast2), shuffle);
                __m128i vdst3 = _mm_shuffle_epi8(_mm_sub_epi8(vsrc3, vlast3), shuffle);
                _mm_storeu_si128((__m128i*)dst + 0, vdst0);
                _mm_storeu_si128((__m128i*)dst + 1, vdst1);
                _mm_storeu_si128((__m128i*)dst + 2, vdst2);
                _mm_storeu_si128((__m128i*)dst + 3, vdst3);
                last += inc * 4;
                src += 64;
                dst += 64;
                size -= 64;
                // adler32 update
                vs = _mm_add_epi32(vs, va);
                va = _mm_add_epi32(va, _mm_sad_epu8(vdst0, zero));
                vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst0, cmul), ones));
                vs = _mm_add_epi32(vs, va);
                va = _mm_add_epi32(va, _mm_sad_epu8(vdst1, zero));
                vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst1, cmul), ones));
                vs = _mm_add_epi32(vs, va);
                va = _mm_add_epi32(va, _mm_sad_epu8(vdst2, zero));
                vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst2, cmul), ones));
                vs = _mm_add_epi32(vs, va);
                va = _mm_add_epi32(va, _mm_sad_epu8(vdst3, zero));
                vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst3, cmul), ones));
                // crc32 update
                x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, crc_mul, 0x00), _mm_clmulepi64_si128(x0, crc_mul, 0x11));
                x0 = _mm_xor_si128(x0, vdst0);
                x1 = _mm_xor_si128(_mm_clmulepi64_si128(x1, crc_mul, 0x00), _mm_clmulepi64_si128(x1, crc_mul, 0x11));
                x1 = _mm_xor_si128(x1, vdst1);
                x2 = _mm_xor_si128(_mm_clmulepi64_si128(x2, crc_mul, 0x00), _mm_clmulepi64_si128(x2, crc_mul, 0x11));
                x2 = _mm_xor_si128(x2, vdst2);
                x3 = _mm_xor_si128(_mm_clmulepi64_si128(x3, crc_mul, 0x00), _mm_clmulepi64_si128(x3, crc_mul, 0x11));
                x3 = _mm_xor_si128(x3, vdst3);
                crc_mul = k1k2;
            }

            // vb += vs * 16
            vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4));
            // a = sum(va)
            va = _mm_hadd_epi32(va, va);
            va = _mm_hadd_epi32(va, va);
            a = _mm_cvtsi128_si32(va);
            // b = sum(vb)
            vb = _mm_hadd_epi32(vb, vb);
            vb = _mm_hadd_epi32(vb, vb);
            b = _mm_cvtsi128_si32(vb);

            a %= UPNG__ADLER32_MOD;
            b %= UPNG__ADLER32_MOD;
        }

        // reduce 512-bit to 128-bit
        x0 = _mm_xor_si128(x1, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
        x0 = _mm_xor_si128(x2, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
        x0 = _mm_xor_si128(x3, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
        crc_mul = k3k4;
    }

    if (size >= 16)
    {
        __m128i vs = zero;
        __m128i va = _mm_cvtsi32_si128(a);
        __m128i vb = _mm_cvtsi32_si128(b);

        // only 1 to 3 iterations
        while (size >= 16)
        {
            __m128i vlast = _mm_loadu_si128((const __m128i*)last);
            __m128i vsrc = _mm_loadu_si128((const __m128i*)src);
            __m128i vdst = _mm_shuffle_epi8(_mm_sub_epi8(vsrc, vlast), shuffle);
            _mm_storeu_si128((__m128i*)dst, vdst);
            last += inc;
            src += 16;
            dst += 16;
            size -= 16;
            // adler32 update
            vs = _mm_add_epi32(vs, va);
            va = _mm_add_epi32(va, _mm_sad_epu8(vdst, zero));
            vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst, cmul), ones));
            // crc32 update
            x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, crc_mul, 0x00), _mm_clmulepi64_si128(x0, crc_mul, 0x11));
            x0 = _mm_xor_si128(x0, vdst);
            crc_mul = k3k4;
        }

        // vb += vs * 16
        vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4));
        // a = sum(va)
        va = _mm_hadd_epi32(va, va);
        va = _mm_hadd_epi32(va, va);
        a = _mm_cvtsi128_si32(va);
        // b = sum(vb)
        vb = _mm_hadd_epi32(vb, vb);
        vb = _mm_hadd_epi32(vb, vb);
        b = _mm_cvtsi128_si32(vb);

        a %= UPNG__ADLER32_MOD;
        b %= UPNG__ADLER32_MOD;
    }

    idat->adler = a | (b << 16);

    // reduce 128-bit to 96-bit
    x0 = _mm_xor_si128(_mm_srli_si128(x0, 8), _mm_clmulepi64_si128(x0, k3k4, 0x10));
    // reduce 96-bit to 64-bit
    x0 = _mm_xor_si128(_mm_srli_si128(x0, 4), _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), k5k0, 0x00));
    // reduce 64-bit to 32-bit
    __m128i x1;
    x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), poly, 0x10);
    x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32), poly, 0x00);
    crc = _mm_extract_epi32(_mm_xor_si128(x0, x1), 1);

    idat->crc = ~crc;
    return dst - out;
}

#endif // UPNG__ARCH_X64_AVX2
static size_t UPNG__TARGET("ssse3,sse4.1") | |
upng__row3_sse4(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, const uint8_t(*shuffle)[16]) | |
{ | |
uint8_t* out = dst; | |
const __m128i s00 = _mm_load_si128((const __m128i*)(shuffle[0])); | |
const __m128i s01 = _mm_load_si128((const __m128i*)(shuffle[1])); | |
const __m128i s10 = _mm_load_si128((const __m128i*)(shuffle[2])); | |
const __m128i s11 = _mm_load_si128((const __m128i*)(shuffle[3])); | |
const __m128i s12 = _mm_load_si128((const __m128i*)(shuffle[4])); | |
const __m128i s21 = _mm_load_si128((const __m128i*)(shuffle[5])); | |
const __m128i s22 = _mm_load_si128((const __m128i*)(shuffle[6])); | |
uint32_t a = idat->adler & 0xffff; | |
uint32_t b = idat->adler >> 16; | |
uint32_t crc = ~idat->crc; | |
// adler32 | |
const __m128i cmul = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); | |
const __m128i ones = _mm_set1_epi16(1); | |
const __m128i zero = _mm_setzero_si128(); | |
while (size >= 48) | |
{ | |
__m128i vs = zero; | |
__m128i va = _mm_cvtsi32_si128(a); | |
__m128i vb = _mm_cvtsi32_si128(b); | |
size_t block_count = size / 48; | |
block_count = block_count < UPNG__ADLER32_BLOCKS3 ? block_count : UPNG__ADLER32_BLOCKS3; | |
for (size_t i = 0; i < block_count; i++) | |
{ | |
// pixel filtering | |
__m128i vlast0 = _mm_loadu_si128((const __m128i*)last + 0); | |
__m128i vlast1 = _mm_loadu_si128((const __m128i*)last + 1); | |
__m128i vlast2 = _mm_loadu_si128((const __m128i*)last + 2); | |
__m128i vsrc0 = _mm_loadu_si128((const __m128i*)src + 0); | |
__m128i vsrc1 = _mm_loadu_si128((const __m128i*)src + 1); | |
__m128i vsrc2 = _mm_loadu_si128((const __m128i*)src + 2); | |
__m128i v0 = _mm_sub_epi8(vsrc0, vlast0); | |
__m128i v1 = _mm_sub_epi8(vsrc1, vlast1); | |
__m128i v2 = _mm_sub_epi8(vsrc2, vlast2); | |
__m128i vdst0 = _mm_or_si128(_mm_shuffle_epi8(v0, s00), _mm_shuffle_epi8(v1, s01)); | |
__m128i vdst1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(v0, s10), _mm_shuffle_epi8(v1, s11)), _mm_shuffle_epi8(v2, s12)); | |
__m128i vdst2 = _mm_or_si128(_mm_shuffle_epi8(v1, s21), _mm_shuffle_epi8(v2, s22)); | |
_mm_storeu_si128((__m128i*)dst + 0, vdst0); | |
_mm_storeu_si128((__m128i*)dst + 1, vdst1); | |
_mm_storeu_si128((__m128i*)dst + 2, vdst2); | |
last += inc; | |
src += 48; | |
dst += 48; | |
size -= 48; | |
// adler32 update | |
vs = _mm_add_epi32(vs, va); | |
va = _mm_add_epi32(va, _mm_sad_epu8(vdst0, zero)); | |
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst0, cmul), ones)); | |
vs = _mm_add_epi32(vs, va); | |
va = _mm_add_epi32(va, _mm_sad_epu8(vdst1, zero)); | |
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst1, cmul), ones)); | |
vs = _mm_add_epi32(vs, va); | |
va = _mm_add_epi32(va, _mm_sad_epu8(vdst2, zero)); | |
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst2, cmul), ones)); | |
// crc32 update | |
uint32_t b0 = _mm_extract_epi32(vdst0, 0) ^ crc; | |
uint32_t b1 = _mm_extract_epi32(vdst0, 1); | |
uint32_t b2 = _mm_extract_epi32(vdst0, 2); | |
uint32_t b3 = _mm_extract_epi32(vdst0, 3); | |
UPNG__MSVC_BARRIER(); | |
crc = upng__crc32_table[0][(b3 >> 24) & 0xff] ^ upng__crc32_table[1][(b3 >> 16) & 0xff] ^ upng__crc32_table[2][(b3 >> 8) & 0xff] ^ upng__crc32_table[3][b3 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[4][(b2 >> 24) & 0xff] ^ upng__crc32_table[5][(b2 >> 16) & 0xff] ^ upng__crc32_table[6][(b2 >> 8) & 0xff] ^ upng__crc32_table[7][b2 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[8][(b1 >> 24) & 0xff] ^ upng__crc32_table[9][(b1 >> 16) & 0xff] ^ upng__crc32_table[10][(b1 >> 8) & 0xff] ^ upng__crc32_table[11][b1 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[12][(b0 >> 24) & 0xff] ^ upng__crc32_table[13][(b0 >> 16) & 0xff] ^ upng__crc32_table[14][(b0 >> 8) & 0xff] ^ upng__crc32_table[15][b0 & 0xff]; | |
b0 = _mm_extract_epi32(vdst1, 0) ^ crc; | |
b1 = _mm_extract_epi32(vdst1, 1); | |
b2 = _mm_extract_epi32(vdst1, 2); | |
b3 = _mm_extract_epi32(vdst1, 3); | |
UPNG__MSVC_BARRIER(); | |
crc = upng__crc32_table[0][(b3 >> 24) & 0xff] ^ upng__crc32_table[1][(b3 >> 16) & 0xff] ^ upng__crc32_table[2][(b3 >> 8) & 0xff] ^ upng__crc32_table[3][b3 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[4][(b2 >> 24) & 0xff] ^ upng__crc32_table[5][(b2 >> 16) & 0xff] ^ upng__crc32_table[6][(b2 >> 8) & 0xff] ^ upng__crc32_table[7][b2 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[8][(b1 >> 24) & 0xff] ^ upng__crc32_table[9][(b1 >> 16) & 0xff] ^ upng__crc32_table[10][(b1 >> 8) & 0xff] ^ upng__crc32_table[11][b1 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[12][(b0 >> 24) & 0xff] ^ upng__crc32_table[13][(b0 >> 16) & 0xff] ^ upng__crc32_table[14][(b0 >> 8) & 0xff] ^ upng__crc32_table[15][b0 & 0xff]; | |
b0 = _mm_extract_epi32(vdst2, 0) ^ crc; | |
b1 = _mm_extract_epi32(vdst2, 1); | |
b2 = _mm_extract_epi32(vdst2, 2); | |
b3 = _mm_extract_epi32(vdst2, 3); | |
UPNG__MSVC_BARRIER(); | |
crc = upng__crc32_table[0][(b3 >> 24) & 0xff] ^ upng__crc32_table[1][(b3 >> 16) & 0xff] ^ upng__crc32_table[2][(b3 >> 8) & 0xff] ^ upng__crc32_table[3][b3 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[4][(b2 >> 24) & 0xff] ^ upng__crc32_table[5][(b2 >> 16) & 0xff] ^ upng__crc32_table[6][(b2 >> 8) & 0xff] ^ upng__crc32_table[7][b2 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[8][(b1 >> 24) & 0xff] ^ upng__crc32_table[9][(b1 >> 16) & 0xff] ^ upng__crc32_table[10][(b1 >> 8) & 0xff] ^ upng__crc32_table[11][b1 & 0xff]; | |
UPNG__MSVC_BARRIER(); | |
crc ^= upng__crc32_table[12][(b0 >> 24) & 0xff] ^ upng__crc32_table[13][(b0 >> 16) & 0xff] ^ upng__crc32_table[14][(b0 >> 8) & 0xff] ^ upng__crc32_table[15][b0 & 0xff]; | |
} | |
// vb += vs * 16 | |
vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4)); | |
// a = sum(va) | |
va = _mm_hadd_epi32(va, va); | |
va = _mm_hadd_epi32(va, va); | |
a = _mm_cvtsi128_si32(va); | |
// b = sum(vb) | |
vb = _mm_hadd_epi32(vb, vb); | |
vb = _mm_hadd_epi32(vb, vb); | |
b = _mm_cvtsi128_si32(vb); | |
a %= UPNG__ADLER32_MOD; | |
b %= UPNG__ADLER32_MOD; | |
} | |
idat->adler = a | (b << 16); | |
idat->crc = ~crc; | |
return dst - out; | |
} | |
static size_t UPNG__TARGET("ssse3,sse4.1,pclmul") | |
upng__row3_clmul(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, const uint8_t (*shuffle)[16]) | |
{ | |
if (size < 48) | |
{ | |
return 0; | |
} | |
uint8_t* out = dst; | |
const __m128i s00 = _mm_load_si128((const __m128i*)(shuffle[0])); | |
const __m128i s01 = _mm_load_si128((const __m128i*)(shuffle[1])); | |
const __m128i s10 = _mm_load_si128((const __m128i*)(shuffle[2])); | |
const __m128i s11 = _mm_load_si128((const __m128i*)(shuffle[3])); | |
const __m128i s12 = _mm_load_si128((const __m128i*)(shuffle[4])); | |
const __m128i s21 = _mm_load_si128((const __m128i*)(shuffle[5])); | |
const __m128i s22 = _mm_load_si128((const __m128i*)(shuffle[6])); | |
uint32_t a = idat->adler & 0xffff; | |
uint32_t b = idat->adler >> 16; | |
uint32_t crc = ~idat->crc; | |
// adler32 | |
const __m128i cmul = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); | |
const __m128i ones = _mm_set1_epi16(1); | |
const __m128i zero = _mm_setzero_si128(); | |
// crc32 | |
// k1 = crc32_pow2mod(3*128+32) | |
// k2 = crc32_pow2mod(3*128-32) | |
const __m128i k1k2 = _mm_setr_epi32(0x3db1ecdc, 0, 0x74359406, 1); | |
const __m128i k3k4 = _mm_setr_epi32(0x751997d0, 1, 0xccaa009e, 0); | |
const __m128i k5k0 = _mm_setr_epi32(0x63cd6124, 1, 0x00000000, 0); | |
const __m128i poly = _mm_setr_epi32(0xdb710641, 1, 0xf7011641, 0); | |
const __m128i mask32 = _mm_setr_epi32(-1, 0, 0, 0); | |
__m128i crc_mul = _mm_setr_epi32(1, 0, 0, 0); | |
__m128i x0 = _mm_cvtsi32_si128(crc); | |
__m128i x1 = zero; | |
__m128i x2 = zero; | |
while (size >= 48) | |
{ | |
__m128i vs = zero; | |
__m128i va = _mm_cvtsi32_si128(a); | |
__m128i vb = _mm_cvtsi32_si128(b); | |
// process as many 3x16-byte blocks as possible | |
size_t block_count = size / 48; | |
block_count = block_count < UPNG__ADLER32_BLOCKS3 ? block_count : UPNG__ADLER32_BLOCKS3; | |
for (size_t i = 0; i < block_count; i++) | |
{ | |
// pixel filtering | |
__m128i vlast0 = _mm_loadu_si128((const __m128i*)last + 0); | |
__m128i vlast1 = _mm_loadu_si128((const __m128i*)last + 1); | |
__m128i vlast2 = _mm_loadu_si128((const __m128i*)last + 2); | |
__m128i vsrc0 = _mm_loadu_si128((const __m128i*)src + 0); | |
__m128i vsrc1 = _mm_loadu_si128((const __m128i*)src + 1); | |
__m128i vsrc2 = _mm_loadu_si128((const __m128i*)src + 2); | |
__m128i v0 = _mm_sub_epi8(vsrc0, vlast0); | |
__m128i v1 = _mm_sub_epi8(vsrc1, vlast1); | |
__m128i v2 = _mm_sub_epi8(vsrc2, vlast2); | |
__m128i vdst0 = _mm_or_si128(_mm_shuffle_epi8(v0, s00), _mm_shuffle_epi8(v1, s01)); | |
__m128i vdst1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(v0, s10), _mm_shuffle_epi8(v1, s11)), _mm_shuffle_epi8(v2, s12)); | |
__m128i vdst2 = _mm_or_si128(_mm_shuffle_epi8(v1, s21), _mm_shuffle_epi8(v2, s22)); | |
_mm_storeu_si128((__m128i*)dst + 0, vdst0); | |
_mm_storeu_si128((__m128i*)dst + 1, vdst1); | |
_mm_storeu_si128((__m128i*)dst + 2, vdst2); | |
last += inc; | |
src += 48; | |
dst += 48; | |
size -= 48; | |
// adler32 update | |
vs = _mm_add_epi32(vs, va); | |
va = _mm_add_epi32(va, _mm_sad_epu8(vdst0, zero)); | |
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst0, cmul), ones)); | |
vs = _mm_add_epi32(vs, va); | |
va = _mm_add_epi32(va, _mm_sad_epu8(vdst1, zero)); | |
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst1, cmul), ones)); | |
vs = _mm_add_epi32(vs, va); | |
va = _mm_add_epi32(va, _mm_sad_epu8(vdst2, zero)); | |
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst2, cmul), ones)); | |
// crc32 update | |
x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, crc_mul, 0x00), _mm_clmulepi64_si128(x0, crc_mul, 0x11)); | |
x0 = _mm_xor_si128(x0, vdst0); | |
x1 = _mm_xor_si128(_mm_clmulepi64_si128(x1, crc_mul, 0x00), _mm_clmulepi64_si128(x1, crc_mul, 0x11)); | |
x1 = _mm_xor_si128(x1, vdst1); | |
x2 = _mm_xor_si128(_mm_clmulepi64_si128(x2, crc_mul, 0x00), _mm_clmulepi64_si128(x2, crc_mul, 0x11)); | |
x2 = _mm_xor_si128(x2, vdst2); | |
crc_mul = k1k2; | |
} | |
// vb += vs * 16 | |
vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4)); | |
// a = sum(va) | |
va = _mm_hadd_epi32(va, va); | |
va = _mm_hadd_epi32(va, va); | |
a = _mm_cvtsi128_si32(va); | |
// b = sum(vb) | |
vb = _mm_hadd_epi32(vb, vb); | |
vb = _mm_hadd_epi32(vb, vb); | |
b = _mm_cvtsi128_si32(vb); | |
a %= UPNG__ADLER32_MOD; | |
b %= UPNG__ADLER32_MOD; | |
} | |
idat->adler = a | (b << 16); | |
// reduce 384-bit to 128-bit | |
x0 = _mm_xor_si128(x1, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11))); | |
x0 = _mm_xor_si128(x2, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11))); | |
// reduce 128-bit to 96-bit | |
x0 = _mm_xor_si128(_mm_srli_si128(x0, 8), _mm_clmulepi64_si128(x0, k3k4, 0x10)); | |
// reduce 96-bit to 64-bit | |
x0 = _mm_xor_si128(_mm_srli_si128(x0, 4), _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), k5k0, 0x00)); | |
// reduce 64-bit to 32-bit | |
x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), poly, 0x10); | |
x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32), poly, 0x00); | |
crc = _mm_extract_epi32(_mm_xor_si128(x0, x1), 1); | |
idat->crc = ~crc; | |
return dst - out; | |
} | |
static size_t UPNG__TARGET("ssse3,sse4.1") | |
upng__unrow1_sse4(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64) | |
{ | |
uint8_t* out = dst; | |
const __m128i shuffle = _mm_set_epi64x(shuffle64 + 0x0808080808080808, shuffle64); | |
while (size >= 16) | |
{ | |
__m128i vlast = _mm_loadu_si128((const __m128i*)last); | |
__m128i vsrc = _mm_loadu_si128((const __m128i*)src); | |
__m128i vdst = _mm_shuffle_epi8(vsrc, shuffle); | |
_mm_storeu_si128((__m128i*)dst, _mm_add_epi8(vdst, vlast)); | |
last += inc; | |
src += 16; | |
dst += 16; | |
size -= 16; | |
} | |
return dst - out; | |
} | |
static size_t UPNG__TARGET("ssse3,sse4.1") | |
upng__unrow3_sse4(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, const uint8_t(*shuffle)[16]) | |
{ | |
uint8_t* out = dst; | |
const __m128i s00 = _mm_load_si128((const __m128i*)(shuffle[0])); | |
const __m128i s01 = _mm_load_si128((const __m128i*)(shuffle[1])); | |
const __m128i s10 = _mm_load_si128((const __m128i*)(shuffle[2])); | |
const __m128i s11 = _mm_load_si128((const __m128i*)(shuffle[3])); | |
const __m128i s12 = _mm_load_si128((const __m128i*)(shuffle[4])); | |
const __m128i s21 = _mm_load_si128((const __m128i*)(shuffle[5])); | |
const __m128i s22 = _mm_load_si128((const __m128i*)(shuffle[6])); | |
while (size >= 48) | |
{ | |
__m128i vlast0 = _mm_loadu_si128((const __m128i*)last + 0); | |
__m128i vlast1 = _mm_loadu_si128((const __m128i*)last + 1); | |
__m128i vlast2 = _mm_loadu_si128((const __m128i*)last + 2); | |
__m128i vsrc0 = _mm_loadu_si128((const __m128i*)src + 0); | |
__m128i vsrc1 = _mm_loadu_si128((const __m128i*)src + 1); | |
__m128i vsrc2 = _mm_loadu_si128((const __m128i*)src + 2); | |
__m128i vdst0 = _mm_or_si128(_mm_shuffle_epi8(vsrc0, s00), _mm_shuffle_epi8(vsrc1, s01)); | |
__m128i vdst1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(vsrc0, s10), _mm_shuffle_epi8(vsrc1, s11)), _mm_shuffle_epi8(vsrc2, s12)); | |
__m128i vdst2 = _mm_or_si128(_mm_shuffle_epi8(vsrc1, s21), _mm_shuffle_epi8(vsrc2, s22)); | |
_mm_storeu_si128((__m128i*)dst + 0, _mm_add_epi8(vdst0, vlast0)); | |
_mm_storeu_si128((__m128i*)dst + 1, _mm_add_epi8(vdst1, vlast1)); | |
_mm_storeu_si128((__m128i*)dst + 2, _mm_add_epi8(vdst2, vlast2)); | |
last += inc; | |
src += 48; | |
dst += 48; | |
size -= 48; | |
} | |
return dst - out; | |
} | |
#endif | |
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#if defined(UPNG__ARCH_ARM64)

static size_t upng__row1_arm64(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64)
{
    if (size < 16)
    {
        return 0;
    }

    uint8_t* out = dst;

    const uint64_t shuffle64_high = shuffle64 + 0x0808080808080808;
    const uint8x16_t shuffle = vreinterpretq_u8_u64(vcombine_u64(vdup_n_u64(shuffle64), vdup_n_u64(shuffle64_high)));

    uint32_t a = idat->adler & 0xffff;
    uint32_t b = idat->adler >> 16;
    uint32_t crc = ~idat->crc;

    const uint8x16_t cmul = vcombine_u8(vcreate_u8(0x090a0b0c0d0e0f10), vcreate_u8(0x0102030405060708));
    const uint32x4_t zero = vdupq_n_u32(0);

#if defined(UPNG__ARM64_CRYPTO)
    const poly64x2_t k1k2 = { 0x154442bd4, 0x1c6e41596 };
    const poly64x2_t k3k4 = { 0x1751997d0, 0x0ccaa009e };
    const poly64_t k5 = { 0x163cd6124 };
    const poly64_t poly_u = { 0x0f7011641 };
    const poly64_t poly_p = { 0x1db710641 };
    const uint64x2_t mask32 = { ~0U, 0 };

    poly64x2_t crc_mul = { 1, 0 };
    poly128_t x0 = vreinterpretq_p128_u64(vcombine_u64(vcreate_u64(crc), vcreate_u64(0)));

#define UPNG__CLADD_P128(a, b) vreinterpretq_p128_u8(veorq_u8(vreinterpretq_u8_p128(a), vreinterpretq_u8_p128(b)))
#define UPNG__CLMUL_P128(x,k,value) do { \
    poly128_t p0 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_p128(x), 0), vgetq_lane_p64(k, 0)); \
    poly128_t p1 = vmull_high_p64(vreinterpretq_p64_p128(x), k); \
    x = UPNG__CLADD_P128(UPNG__CLADD_P128(p0, p1), vreinterpretq_p128_u8(value)); \
} while (0)
#endif

    if (size >= 64)
    {
#if defined(UPNG__ARM64_CRYPTO)
        poly128_t x1 = vreinterpretq_p128_u64(vcombine_u64(vcreate_u64(0), vcreate_u64(0)));
        poly128_t x2 = x1;
        poly128_t x3 = x1;
#endif
        while (size >= 64)
        {
            uint32x4_t va = vsetq_lane_u32(a, zero, 0);
            uint32x4_t vb = vsetq_lane_u32(b, zero, 0);
            uint32x4_t vs = zero;

            // process as many 64-byte blocks as possible
            size_t block_count = size / 64;
            block_count = block_count < UPNG__ADLER32_BLOCKS4 ? block_count : UPNG__ADLER32_BLOCKS4;
            for (size_t i = 0; i < block_count; i++)
            {
                // pixel filtering
                uint8x16x4_t vlast = vld1q_u8_x4(last);
                uint8x16x4_t vsrc = vld1q_u8_x4(src);
                uint8x16_t v0 = vsubq_u8(vsrc.val[0], vlast.val[0]);
                uint8x16_t v1 = vsubq_u8(vsrc.val[1], vlast.val[1]);
                uint8x16_t v2 = vsubq_u8(vsrc.val[2], vlast.val[2]);
                uint8x16_t v3 = vsubq_u8(vsrc.val[3], vlast.val[3]);
                uint8x16_t vdst0 = vqtbl1q_u8(v0, shuffle);
                uint8x16_t vdst1 = vqtbl1q_u8(v1, shuffle);
                uint8x16_t vdst2 = vqtbl1q_u8(v2, shuffle);
                uint8x16_t vdst3 = vqtbl1q_u8(v3, shuffle);
                uint8x16x4_t vdst = { vdst0, vdst1, vdst2, vdst3 };
                vst1q_u8_x4(dst, vdst);
                last += inc * 4;
                src += 64;
                dst += 64;
                size -= 64;
                // adler32 update; these could use vdotq_u32, but it runs ~2% slower
                uint16x8_t t0, t1, t2, t3;
                vs = vaddq_u32(vs, va);
                va = vpadalq_u16(va, vpaddlq_u8(vdst0));
                vs = vaddq_u32(vs, va);
                va = vpadalq_u16(va, vpaddlq_u8(vdst1));
                vs = vaddq_u32(vs, va);
                va = vpadalq_u16(va, vpaddlq_u8(vdst2));
                vs = vaddq_u32(vs, va);
                va = vpadalq_u16(va, vpaddlq_u8(vdst3));
                t0 = vmull_u8(vget_low_u8(vdst0), vget_low_u8(cmul));
                t1 = vmull_u8(vget_low_u8(vdst1), vget_low_u8(cmul));
                t2 = vmull_u8(vget_low_u8(vdst2), vget_low_u8(cmul));
                t3 = vmull_u8(vget_low_u8(vdst3), vget_low_u8(cmul));
                t0 = vmlal_high_u8(t0, vdst0, cmul);
                t1 = vmlal_high_u8(t1, vdst1, cmul);
                t2 = vmlal_high_u8(t2, vdst2, cmul);
                t3 = vmlal_high_u8(t3, vdst3, cmul);
                vb = vpadalq_u16(vb, t0);
                vb = vpadalq_u16(vb, t1);
                vb = vpadalq_u16(vb, t2);
                vb = vpadalq_u16(vb, t3);
                // crc32 update
#if defined(UPNG__ARM64_CRYPTO)
                UPNG__CLMUL_P128(x0, crc_mul, vdst0);
                UPNG__CLMUL_P128(x1, crc_mul, vdst1);
                UPNG__CLMUL_P128(x2, crc_mul, vdst2);
                UPNG__CLMUL_P128(x3, crc_mul, vdst3);
                crc_mul = k1k2;
#elif defined(UPNG__ARM64_CRC32)
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 0));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 1));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst1), 0));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst1), 1));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst2), 0));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst2), 1));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst3), 0));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst3), 1));
#else
                uint32_t b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 0) ^ crc;
                uint32_t b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 1);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 2) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 3);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 0) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 1);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 2) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 3);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 0) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 1);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 2) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 3);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst3), 0) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst3), 1);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst3), 2) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst3), 3);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
#endif
            }

            // vb += vs * 16
            vb = vaddq_u32(vb, vshlq_n_u32(vs, 4));
            a = vaddvq_u32(va);
            b = vaddvq_u32(vb);

            a %= UPNG__ADLER32_MOD;
            b %= UPNG__ADLER32_MOD;
        }
#if defined(UPNG__ARM64_CRYPTO)
        // reduce 512-bit to 128-bit
        UPNG__CLMUL_P128(x0, k3k4, vreinterpretq_u8_p128(x1));
        UPNG__CLMUL_P128(x0, k3k4, vreinterpretq_u8_p128(x2));
        UPNG__CLMUL_P128(x0, k3k4, vreinterpretq_u8_p128(x3));
        crc_mul = k3k4;
#endif
    }

    if (size >= 16)
    {
        uint32x4_t va = vsetq_lane_u32(a, zero, 0);
        uint32x4_t vb = vsetq_lane_u32(b, zero, 0);
        uint32x4_t vs = zero;

        // only 1 to 3 iterations
        while (size >= 16)
        {
            uint8x16_t v0 = vsubq_u8(vld1q_u8(src), vld1q_u8(last));
            uint8x16_t vdst0 = vqtbl1q_u8(v0, shuffle);
            vst1q_u8(dst, vdst0);
            last += inc;
            src += 16;
            dst += 16;
            size -= 16;
            // adler32 update
            uint16x8_t t0;
            vs = vaddq_u32(vs, va);
            va = vpadalq_u16(va, vpaddlq_u8(vdst0));
            t0 = vmull_u8(vget_low_u8(vdst0), vget_low_u8(cmul));
            t0 = vmlal_high_u8(t0, vdst0, cmul);
            vb = vpadalq_u16(vb, t0);
            // crc32 update
#if defined(UPNG__ARM64_CRYPTO)
            UPNG__CLMUL_P128(x0, crc_mul, vdst0);
            crc_mul = k3k4;
#elif defined(UPNG__ARM64_CRC32)
            crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 0));
            crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 1));
#else
            uint32_t b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 0) ^ crc;
            uint32_t b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 1);
            crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
            crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
            b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 2) ^ crc;
            b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 3);
            crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
            crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
#endif
        }

        // vb += vs * 16
        vb = vaddq_u32(vb, vshlq_n_u32(vs, 4));
        a = vaddvq_u32(va);
        b = vaddvq_u32(vb);

        a %= UPNG__ADLER32_MOD;
        b %= UPNG__ADLER32_MOD;
    }

#if defined(UPNG__ARM64_CRYPTO)
    // reduce 128-bit to 96-bit
    poly128_t p0;
    p0 = vreinterpretq_p128_u8(vextq_u8(vreinterpretq_u8_p128(x0), vdupq_n_u8(0), 8));
    x0 = UPNG__CLADD_P128(p0, vmull_p64(vgetq_lane_p64(vreinterpretq_p64_p128(x0), 0), vgetq_lane_p64(k3k4, 1)));
    // reduce 96-bit to 64-bit
    p0 = vreinterpretq_p128_u8(vextq_u8(vreinterpretq_u8_p128(x0), vdupq_n_u8(0), 4));
    x0 = UPNG__CLADD_P128(p0, vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x0), mask32)), 0), k5));
    // reduce 64-bit to 32-bit
    poly128_t x1;
    x1 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x0), mask32)), 0), poly_u);
    x1 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x1), mask32)), 0), poly_p);
    crc = vgetq_lane_u32(vreinterpretq_u32_p128(UPNG__CLADD_P128(x0, x1)), 1);
#undef UPNG__CLADD_P128
#undef UPNG__CLMUL_P128
#endif

    idat->adler = a | (b << 16);
    idat->crc = ~crc;
    return dst - out;
}
static size_t upng__row3_arm64(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, const uint8_t(*shuffle)[16]) | |
{ | |
if (size < 48) | |
{ | |
return 0; | |
} | |
uint8_t* out = dst; | |
const uint8x16_t s00 = vld1q_u8(shuffle[0]); | |
const uint8x16_t s01 = vld1q_u8(shuffle[1]); | |
const uint8x16_t s10 = vld1q_u8(shuffle[2]); | |
const uint8x16_t s11 = vld1q_u8(shuffle[3]); | |
const uint8x16_t s12 = vld1q_u8(shuffle[4]); | |
const uint8x16_t s21 = vld1q_u8(shuffle[5]); | |
const uint8x16_t s22 = vld1q_u8(shuffle[6]); | |
uint32_t a = idat->adler & 0xffff; | |
uint32_t b = idat->adler >> 16; | |
uint32_t crc = ~idat->crc; | |
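// the two little-endian u64 constants below place weights 16,15,...,1 into byte lanes 0..15, | |
// each byte's multiplier for the adler32 "b" sum within a 16-byte chunk | |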
const uint8x16_t cmul = vcombine_u8(vcreate_u8(0x090a0b0c0d0e0f10), vcreate_u8(0x0102030405060708)); | |
const uint32x4_t zero = vdupq_n_u32(0); | |
#if defined(UPNG__ARM64_CRYPTO) | |
const poly64x2_t k1k2 = { 0x03db1ecdc, 0x174359406 }; | |
const poly64x2_t k3k4 = { 0x1751997d0, 0x0ccaa009e }; | |
const poly64_t k5 = { 0x163cd6124 }; | |
const poly64_t poly_u = { 0x0f7011641 }; | |
const poly64_t poly_p = { 0x1db710641 }; | |
const uint64x2_t mask32 = { ~0U, 0 }; | |
poly64x2_t crc_mul = { 1, 0 }; | |
poly128_t x0 = vreinterpretq_p128_u64(vcombine_u64(vcreate_u64(crc), vcreate_u64(0))); | |
poly128_t x1 = vreinterpretq_p128_u64(vcombine_u64(vcreate_u64(0), vcreate_u64(0))); | |
poly128_t x2 = x1; | |
#define UPNG__CLADD_P128(a, b) vreinterpretq_p128_u8(veorq_u8(vreinterpretq_u8_p128(a), vreinterpretq_u8_p128(b))) | |
#define UPNG__CLMUL_P128(x,k,value) do { \ | |
poly128_t p0 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_p128(x), 0), vgetq_lane_p64(k, 0)); \ | |
poly128_t p1 = vmull_high_p64(vreinterpretq_p64_p128(x), k); \ | |
x = UPNG__CLADD_P128(UPNG__CLADD_P128(p0, p1), vreinterpretq_p128_u8(value)); \ | |
} while (0) | |
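// one CRC32 folding step via carry-less multiplication: the 128-bit accumulator "x" is | |
// multiplied by a pair of precomputed folding constants (powers of x modulo the reflected | |
// CRC polynomial) and XORed with the next 16 data bytes; this follows the approach of | |
// Intel's "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" paper | |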
#endif | |
while (size >= 48) | |
{ | |
uint32x4_t va = vsetq_lane_u32(a, zero, 0); | |
uint32x4_t vb = vsetq_lane_u32(b, zero, 0); | |
uint32x4_t vs = zero; | |
// process as many 3x16-byte blocks as possible | |
size_t block_count = size / 48; | |
block_count = block_count < UPNG__ADLER32_BLOCKS3 ? block_count : UPNG__ADLER32_BLOCKS3; | |
for (size_t i = 0; i < block_count; i++) | |
{ | |
uint8x16x3_t vlast = vld1q_u8_x3(last); | |
uint8x16x3_t vsrc = vld1q_u8_x3(src); | |
uint8x16_t v0 = vsubq_u8(vsrc.val[0], vlast.val[0]); | |
uint8x16_t v1 = vsubq_u8(vsrc.val[1], vlast.val[1]); | |
uint8x16_t v2 = vsubq_u8(vsrc.val[2], vlast.val[2]); | |
uint8x16_t vdst0 = vqtbx1q_u8(vqtbl1q_u8(v0, s00), v1, s01); | |
uint8x16_t vdst1 = vqtbx1q_u8(vqtbx1q_u8(vqtbl1q_u8(v0, s10), v1, s11), v2, s12); | |
uint8x16_t vdst2 = vqtbx1q_u8(vqtbl1q_u8(v1, s21), v2, s22); | |
uint8x16x3_t vdst = { vdst0, vdst1, vdst2 }; | |
vst1q_u8_x3(dst, vdst); | |
last += inc; | |
src += 48; | |
dst += 48; | |
size -= 48; | |
vs = vaddq_u32(vs, va); | |
va = vpadalq_u16(va, vpaddlq_u8(vdst0)); | |
vs = vaddq_u32(vs, va); | |
va = vpadalq_u16(va, vpaddlq_u8(vdst1)); | |
vs = vaddq_u32(vs, va); | |
va = vpadalq_u16(va, vpaddlq_u8(vdst2)); | |
uint16x8_t t0, t1, t2; | |
t0 = vmull_u8(vget_low_u8(vdst0), vget_low_u8(cmul)); | |
t1 = vmull_u8(vget_low_u8(vdst1), vget_low_u8(cmul)); | |
t2 = vmull_u8(vget_low_u8(vdst2), vget_low_u8(cmul)); | |
t0 = vmlal_high_u8(t0, vdst0, cmul); | |
t1 = vmlal_high_u8(t1, vdst1, cmul); | |
t2 = vmlal_high_u8(t2, vdst2, cmul); | |
vb = vpadalq_u16(vb, t0); | |
vb = vpadalq_u16(vb, t1); | |
vb = vpadalq_u16(vb, t2); | |
#if defined(UPNG__ARM64_CRYPTO) | |
UPNG__CLMUL_P128(x0, crc_mul, vdst0); | |
UPNG__CLMUL_P128(x1, crc_mul, vdst1); | |
UPNG__CLMUL_P128(x2, crc_mul, vdst2); | |
crc_mul = k1k2; | |
#elif defined(UPNG__ARM64_CRC32) | |
crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 0)); | |
crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 1)); | |
crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst1), 0)); | |
crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst1), 1)); | |
crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst2), 0)); | |
crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst2), 1)); | |
#else | |
uint32_t b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 0) ^ crc; | |
uint32_t b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 1); | |
crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff]; | |
crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff]; | |
b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 2) ^ crc; | |
b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 3); | |
crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff]; | |
crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff]; | |
b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 0) ^ crc; | |
b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 1); | |
crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff]; | |
crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff]; | |
b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 2) ^ crc; | |
b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 3); | |
crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff]; | |
crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff]; | |
b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 0) ^ crc; | |
b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 1); | |
crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff]; | |
crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff]; | |
b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 2) ^ crc; | |
b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 3); | |
crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff]; | |
crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff]; | |
#endif | |
} | |
vb = vaddq_u32(vb, vshlq_n_u32(vs, 4)); | |
a = vaddvq_u32(va); | |
b = vaddvq_u32(vb); | |
a %= UPNG__ADLER32_MOD; | |
b %= UPNG__ADLER32_MOD; | |
} | |
#if defined(UPNG__ARM64_CRYPTO) | |
// reduce 384-bit to 128-bit | |
UPNG__CLMUL_P128(x0, k3k4, vreinterpretq_u8_p128(x1)); | |
UPNG__CLMUL_P128(x0, k3k4, vreinterpretq_u8_p128(x2)); | |
// reduce 128-bit to 96-bit | |
poly128_t p0; | |
p0 = vreinterpretq_p128_u8(vextq_u8(vreinterpretq_u8_p128(x0), vdupq_n_u8(0), 8)); | |
x0 = UPNG__CLADD_P128(p0, vmull_p64(vgetq_lane_p64(vreinterpretq_p64_p128(x0), 0), vgetq_lane_p64(k3k4, 1))); | |
// reduce 96-bit to 64-bit | |
p0 = vreinterpretq_p128_u8(vextq_u8(vreinterpretq_u8_p128(x0), vdupq_n_u8(0), 4)); | |
x0 = UPNG__CLADD_P128(p0, vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x0), mask32)), 0), k5)); | |
// reduce 64-bit to 32-bit | |
x1 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x0), mask32)), 0), poly_u); | |
x1 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x1), mask32)), 0), poly_p); | |
crc = vgetq_lane_u32(vreinterpretq_u32_p128(UPNG__CLADD_P128(x0, x1)), 1); | |
#undef UPNG__CLMUL_P128 | |
#undef UPNG__CLADD_P128 | |
#endif | |
idat->adler = a | (b << 16); | |
idat->crc = ~crc; | |
return dst - out; | |
} | |
static size_t upng__unrow1_arm64(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64) | |
{ | |
uint8_t* out = dst; | |
uint64_t shuffle64_high = shuffle64 + 0x0808080808080808; | |
const uint8x16_t shuffle = vreinterpretq_u8_u64(vcombine_u64(vdup_n_u64(shuffle64), vdup_n_u64(shuffle64_high))); | |
while (size >= 16) | |
{ | |
uint8x16_t vdst0 = vqtbl1q_u8(vld1q_u8(src), shuffle); | |
vdst0 = vaddq_u8(vdst0, vld1q_u8(last)); | |
vst1q_u8(dst, vdst0); | |
last += inc; | |
src += 16; | |
dst += 16; | |
size -= 16; | |
} | |
return dst - out; | |
} | |
static size_t upng__unrow3_arm64(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, const uint8_t(*shuffle)[16]) | |
{ | |
uint8_t* out = dst; | |
const uint8x16_t s00 = vld1q_u8(shuffle[0]); | |
const uint8x16_t s01 = vld1q_u8(shuffle[1]); | |
const uint8x16_t s10 = vld1q_u8(shuffle[2]); | |
const uint8x16_t s11 = vld1q_u8(shuffle[3]); | |
const uint8x16_t s12 = vld1q_u8(shuffle[4]); | |
const uint8x16_t s21 = vld1q_u8(shuffle[5]); | |
const uint8x16_t s22 = vld1q_u8(shuffle[6]); | |
while (size >= 48) | |
{ | |
uint8x16x3_t vsrc = vld1q_u8_x3(src); | |
uint8x16_t vsrc0 = vsrc.val[0]; | |
uint8x16_t vsrc1 = vsrc.val[1]; | |
uint8x16_t vsrc2 = vsrc.val[2]; | |
uint8x16_t vdst0 = vqtbx1q_u8(vqtbl1q_u8(vsrc0, s00), vsrc1, s01); | |
uint8x16_t vdst1 = vqtbx1q_u8(vqtbx1q_u8(vqtbl1q_u8(vsrc0, s10), vsrc1, s11), vsrc2, s12); | |
uint8x16_t vdst2 = vqtbx1q_u8(vqtbl1q_u8(vsrc1, s21), vsrc2, s22); | |
uint8x16x3_t vlast = vld1q_u8_x3(last); | |
vdst0 = vaddq_u8(vdst0, vlast.val[0]); | |
vdst1 = vaddq_u8(vdst1, vlast.val[1]); | |
vdst2 = vaddq_u8(vdst2, vlast.val[2]); | |
uint8x16x3_t vdst = { vdst0, vdst1, vdst2 }; | |
vst1q_u8_x3(dst, vdst); | |
last += inc; | |
src += 48; | |
dst += 48; | |
size -= 48; | |
} | |
return dst - out; | |
} | |
#endif | |
#define _ 0xff | |
#define __ _ | |
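// a 0xff lane index makes NEON TBL (vqtbl1q_u8) write 0 and TBX (vqtbx1q_u8) keep the | |
// destination lane unchanged, so each output vector below is layered from one TBL plus | |
// one or two TBX lookups; x86 pshufb likewise zeroes lanes whose high bit is set | |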
// identity shuffle, nothing to rearrange | |
static const uint8_t UPNG__ALIGN(16, upng__shuffle_RGB8[7][16]) = | |
{ | |
{ 0,1,2, 3,4,5, 6,7,8, 9,10,11, 12,13,14, 15}, | |
{ _,_,_, _,_,_, _,_,_, _,__,__, __,__,__, __}, | |
{ _,_,_, _,_,_, _,_,_, _,__,__, __,__,__, __}, | |
{ 0,1,2, 3,4,5, 6,7,8, 9,10,11, 12,13,14, 15}, | |
{ _,_,_, _,_,_, _,_,_, _,__,__, __,__,__, __}, | |
{ _,_,_, _,_,_, _,_,_, _,__,__, __,__,__, __}, | |
{ 0,1,2, 3,4,5, 6,7,8, 9,10,11, 12,13,14, 15}, | |
}; | |
// 0123456789012345 | |
// src0 = [BGRBGRBGRBGRBGRB] | |
// src1 = [GRBGRBGRBGRBGRBG] | |
// src2 = [RBGRBGRBGRBGRBGR] | |
static const uint8_t UPNG__ALIGN(16, upng__shuffle_BGR8[7][16]) = | |
{ | |
{ 2,1,0, 5,4,3, 8,7,6, 11,10,9, 14,13,12, _}, // RGB RGB RGB RGB RGB _ | |
{ _,_,_, _,_,_, _,_,_, __,__,_, __,__,__, 1}, // ___ ___ ___ ___ ___ R | |
{ _,15, _,_,_, _,_,_, __,_,_, __,__,__, _,__}, // _B ___ ___ ___ ___ __ | |
{ 0,__, 4,3,2, 7,6,5, 10,9,8, 13,12,11, _,15}, // G_ RGB RGB RGB RGB _G | |
{ _,__, _,_,_, _,_,_, __,_,_, __,__,__, 0,__}, // __ ___ ___ ___ ___ R_ | |
{14, _,_,_, _,_,_, _,_,_, __,__,__, __,__,__}, // B ___ ___ ___ ___ ___ | |
{__, 3,2,1, 6,5,4, 9,8,7, 12,11,10, 15,14,13}, // _ RGB RGB RGB RGB RGB | |
}; | |
// only swap bytes in each 16-bit value | |
static const uint8_t UPNG__ALIGN(16, upng__shuffle_RGB16[7][16]) = | |
{ | |
{1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14}, | |
{_,_, _,_, _,_, _,_, _,_, __,__, __,__, __,__}, | |
{_,_, _,_, _,_, _,_, _,_, __,__, __,__, __,__}, | |
{1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14}, | |
{_,_, _,_, _,_, _,_, _,_, __,__, __,__, __,__}, | |
{_,_, _,_, _,_, _,_, _,_, __,__, __,__, __,__}, | |
{1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14}, | |
}; | |
// 0123456789012345 | |
// src0 = [BBGGRRBBGGRRBBGG] | |
// src1 = [RRBBGGRRBBGGRRBB] | |
// src2 = [GGRRBBGGRRBBGGRR] | |
static const uint8_t UPNG__ALIGN(16, upng__shuffle_BGR16[7][16]) = | |
{ | |
{5,4,3,2,1,0, 11,10,9,8,7,6, _,_,15,14}, // RRGGBB RRGGBB __GG | |
{_,_,_,_,_,_, __,__,_,_,_,_, 1,0,__,__}, // ______ ______ RR__ | |
{13,12, _,_,_,_,_,_, __,__,__,__,_,_, _,_}, // BB ______ ______ __ | |
{__,__, 7,6,5,4,3,2, 13,12,11,10,9,8, _,_}, // __ RRGGBB RRGGBB __ | |
{__,__, _,_,_,_,_,_, __,__,__,__,_,_, 3,2}, // __ ______ ______ RR | |
{_,_, 15,14, _,_,_,_,_,_, __,__,__,__,__,__}, // __BB ______ ______ | |
{1,0, __,__, 9,8,7,6,5,4, 15,14,13,12,11,10}, // GG__ RRGGBB RRGGBB | |
}; | |
#undef _ | |
#undef __ | |
// handles G8, GA8, RGBA8, BGRA8, G16, GA16, BGRA16, RGBA16 | |
static size_t upng__row1(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, upng_format format) | |
{ | |
uint64_t shuffle64; | |
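// shuffle64 packs 8 source-lane indices, least significant byte first; the SIMD helpers | |
// derive the upper half of a 16-byte shuffle by adding 8 to each index (see upng__unrow1_arm64) | |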
switch (format) | |
{ | |
case UPNG_FORMAT_G8: | |
case UPNG_FORMAT_GA8: | |
case UPNG_FORMAT_RGBA8: | |
shuffle64 = 0x0706050403020100; // nothing to shuffle, identity | |
break; | |
case UPNG_FORMAT_BGRA8: | |
shuffle64 = 0x0704050603000102; // BGRA to RGBA | |
break; | |
case UPNG_FORMAT_G16: | |
case UPNG_FORMAT_GA16: | |
case UPNG_FORMAT_RGBA16: | |
shuffle64 = 0x0607040502030001; // swap bytes | |
break; | |
case UPNG_FORMAT_BGRA16: | |
shuffle64 = 0x0607000102030405; // swap bytes, BGRA to RGBA | |
break; | |
default: | |
shuffle64 = 0; | |
break; | |
} | |
#if defined(UPNG__ARCH_X64_AVX2) | |
int cpuid = upng__cpuid(); | |
if (cpuid & UPNG__CPUID_CLMUL) | |
{ | |
return upng__row1_avx2(idat, dst, src, last, size, inc, shuffle64); | |
} | |
else | |
{ | |
return upng__row1_sse4(idat, dst, src, last, size, inc, shuffle64); | |
} | |
#elif defined(UPNG__ARCH_X64) | |
int cpuid = upng__cpuid(); | |
if (cpuid & UPNG__CPUID_CLMUL) | |
{ | |
return upng__row1_clmul(idat, dst, src, last, size, inc, shuffle64); | |
} | |
else if (cpuid & UPNG__CPUID_SSE41) | |
{ | |
return upng__row1_sse4(idat, dst, src, last, size, inc, shuffle64); | |
} | |
#elif defined(UPNG__ARCH_ARM64) | |
return upng__row1_arm64(idat, dst, src, last, size, inc, shuffle64); | |
#else | |
(void)shuffle64; | |
#endif | |
return 0; | |
} | |
// handles RGB8, BGR8, RGB16, BGR16 | |
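// 3-byte and 6-byte pixels are processed in 48-byte blocks, since 48 = lcm(3,16) = lcm(6,16), | |
// which keeps every vector block aligned to pixel boundaries | |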
static size_t upng__row3(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, upng_format format) | |
{ | |
const uint8_t (*shuffle)[16]; | |
switch (format) | |
{ | |
case UPNG_FORMAT_RGB8: | |
shuffle = upng__shuffle_RGB8; | |
break; | |
case UPNG_FORMAT_BGR8: | |
shuffle = upng__shuffle_BGR8; | |
break; | |
case UPNG_FORMAT_RGB16: | |
shuffle = upng__shuffle_RGB16; | |
break; | |
case UPNG_FORMAT_BGR16: | |
shuffle = upng__shuffle_BGR16; | |
break; | |
default: | |
shuffle = NULL; | |
break; | |
} | |
#if defined(UPNG__ARCH_X64) | |
int cpuid = upng__cpuid(); | |
if (cpuid & UPNG__CPUID_CLMUL) | |
{ | |
return upng__row3_clmul(idat, dst, src, last, size, inc, shuffle); | |
} | |
else if (cpuid & UPNG__CPUID_SSE41) | |
{ | |
return upng__row3_sse4(idat, dst, src, last, size, inc, shuffle); | |
} | |
#elif defined(UPNG__ARCH_ARM64) | |
return upng__row3_arm64(idat, dst, src, last, size, inc, shuffle); | |
#else | |
(void)shuffle; | |
#endif | |
return 0; | |
} | |
static void upng__row(upng__idat* idat, uint8_t* dst, const uint8_t* src, size_t pitch, size_t size, upng_format format, upng_filter filter) | |
{ | |
// the NONE filter is the same as UP when the previous row is all zero values | |
// (dst = src - 0 == src), so both filters share one code path with "last" pointing at a zero row | |
static const uint8_t UPNG__ALIGN(64, zero[64]) = { 0 }; | |
const uint8_t* last = filter == UPNG_FILTER_NONE ? zero : src - pitch; | |
size_t inc; | |
size_t used; | |
if (format == UPNG_FORMAT_RGB8 || format == UPNG_FORMAT_BGR8 || format == UPNG_FORMAT_RGB16 || format == UPNG_FORMAT_BGR16) | |
{ | |
inc = filter == UPNG_FILTER_NONE ? 0 : 48; | |
used = upng__row3(idat, dst, src, last, size, inc, format); | |
} | |
else | |
{ | |
inc = filter == UPNG_FILTER_NONE ? 0 : 16; | |
used = upng__row1(idat, dst, src, last, size, inc, format); | |
} | |
last += inc == 0 ? 0 : used; | |
src += used; | |
dst += used; | |
size -= used; | |
uint8_t* tail = dst; | |
size_t tail_size = size; | |
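// the SIMD helpers above have already folded adler32/crc32 for the bytes they produced; | |
// the scalar tail handled below is checksummed once at the end of this function | |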
// remaining size is less than one vector block (16 or 48 bytes); it is the whole row when no SIMD path is available | |
switch (format) | |
{ | |
case UPNG_FORMAT_G8: // 16 pixels per 16 bytes | |
inc /= 16; // 0 or 1 | |
while (size != 0) | |
{ | |
dst[0] = src[0] - last[0]; | |
last += inc; | |
src += 1; | |
dst += 1; | |
size -= 1; | |
} | |
break; | |
case UPNG_FORMAT_GA8: // 8 pixels per 16 bytes | |
inc /= 8; // 0 or 2 | |
while (size != 0) | |
{ | |
dst[0] = src[0] - last[0]; | |
dst[1] = src[1] - last[1]; | |
last += inc; | |
src += 2; | |
dst += 2; | |
size -= 2; | |
} | |
break; | |
case UPNG_FORMAT_RGB8: // 16 pixels per 48 bytes | |
inc /= 16; // 0 or 3 | |
while (size != 0) | |
{ | |
dst[0] = src[0] - last[0]; | |
dst[1] = src[1] - last[1]; | |
dst[2] = src[2] - last[2]; | |
last += inc; | |
src += 3; | |
dst += 3; | |
size -= 3; | |
} | |
break; | |
case UPNG_FORMAT_BGR8: // 16 pixels per 48 bytes | |
inc /= 16; // 0 or 3 | |
while (size != 0) | |
{ | |
dst[0] = src[2] - last[2]; | |
dst[1] = src[1] - last[1]; | |
dst[2] = src[0] - last[0]; | |
last += inc; | |
src += 3; | |
dst += 3; | |
size -= 3; | |
} | |
break; | |
case UPNG_FORMAT_RGBA8: // 4 pixels per 16 bytes | |
inc /= 4; // 0 or 4 | |
while (size != 0) | |
{ | |
dst[0] = src[0] - last[0]; | |
dst[1] = src[1] - last[1]; | |
dst[2] = src[2] - last[2]; | |
dst[3] = src[3] - last[3]; | |
last += inc; | |
src += 4; | |
dst += 4; | |
size -= 4; | |
} | |
break; | |
case UPNG_FORMAT_BGRA8: // 4 pixels per 16 bytes | |
inc /= 4; // 0 or 4 | |
while (size != 0) | |
{ | |
dst[0] = src[2] - last[2]; | |
dst[1] = src[1] - last[1]; | |
dst[2] = src[0] - last[0]; | |
dst[3] = src[3] - last[3]; | |
last += inc; | |
src += 4; | |
dst += 4; | |
size -= 4; | |
} | |
break; | |
case UPNG_FORMAT_G16: // 8 pixels per 16 bytes | |
inc /= 8; // 0 or 2 | |
while (size != 0) | |
{ | |
dst[0] = src[1] - last[1]; | |
dst[1] = src[0] - last[0]; | |
last += inc; | |
src += 2; | |
dst += 2; | |
size -= 2; | |
} | |
break; | |
case UPNG_FORMAT_GA16: // 4 pixels per 16 bytes | |
inc /= 4; // 0 or 4 | |
while (size != 0) | |
{ | |
dst[0] = src[1] - last[1]; | |
dst[1] = src[0] - last[0]; | |
dst[2] = src[3] - last[3]; | |
dst[3] = src[2] - last[2]; | |
last += inc; | |
src += 4; | |
dst += 4; | |
size -= 4; | |
} | |
break; | |
case UPNG_FORMAT_RGB16: // 8 pixels per 48 bytes | |
inc /= 8; // 0 or 6 | |
while (size != 0) | |
{ | |
dst[0] = src[1] - last[1]; | |
dst[1] = src[0] - last[0]; | |
dst[2] = src[3] - last[3]; | |
dst[3] = src[2] - last[2]; | |
dst[4] = src[5] - last[5]; | |
dst[5] = src[4] - last[4]; | |
last += inc; | |
src += 6; | |
dst += 6; | |
size -= 6; | |
} | |
break; | |
case UPNG_FORMAT_BGR16: // 8 pixels per 48 bytes | |
inc /= 8; // 0 or 6 | |
while (size != 0) | |
{ | |
dst[0] = src[5] - last[5]; | |
dst[1] = src[4] - last[4]; | |
dst[2] = src[3] - last[3]; | |
dst[3] = src[2] - last[2]; | |
dst[4] = src[1] - last[1]; | |
dst[5] = src[0] - last[0]; | |
last += inc; | |
src += 6; | |
dst += 6; | |
size -= 6; | |
} | |
break; | |
case UPNG_FORMAT_RGBA16: // 2 pixels per 16 bytes | |
inc /= 2; // 0 or 8 | |
while (size != 0) | |
{ | |
dst[0] = src[1] - last[1]; | |
dst[1] = src[0] - last[0]; | |
dst[2] = src[3] - last[3]; | |
dst[3] = src[2] - last[2]; | |
dst[4] = src[5] - last[5]; | |
dst[5] = src[4] - last[4]; | |
dst[6] = src[7] - last[7]; | |
dst[7] = src[6] - last[6]; | |
last += inc; | |
src += 8; | |
dst += 8; | |
size -= 8; | |
} | |
break; | |
case UPNG_FORMAT_BGRA16: // 2 pixels per 16 bytes | |
inc /= 2; // 0 or 8 | |
while (size != 0) | |
{ | |
dst[0] = src[5] - last[5]; | |
dst[1] = src[4] - last[4]; | |
dst[2] = src[3] - last[3]; | |
dst[3] = src[2] - last[2]; | |
dst[4] = src[1] - last[1]; | |
dst[5] = src[0] - last[0]; | |
dst[6] = src[7] - last[7]; | |
dst[7] = src[6] - last[6]; | |
last += inc; | |
src += 8; | |
dst += 8; | |
size -= 8; | |
} | |
break; | |
} | |
idat->adler = upng__adler32(idat->adler, tail, tail_size); | |
idat->crc = upng__crc32(idat->crc, tail, tail_size); | |
} | |
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
// handles G8, GA8, RGBA8, BGRA8, G16, GA16, BGRA16, RGBA16 | |
static size_t upng__unrow1(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, upng_format format) | |
{ | |
uint64_t shuffle64; | |
switch (format) | |
{ | |
case UPNG_FORMAT_G8: | |
case UPNG_FORMAT_GA8: | |
case UPNG_FORMAT_RGBA8: | |
shuffle64 = 0x0706050403020100; // nothing to shuffle, identity | |
break; | |
case UPNG_FORMAT_BGRA8: | |
shuffle64 = 0x0704050603000102; // BGRA to RGBA | |
break; | |
case UPNG_FORMAT_G16: | |
case UPNG_FORMAT_GA16: | |
case UPNG_FORMAT_RGBA16: | |
shuffle64 = 0x0607040502030001; // swap bytes | |
break; | |
case UPNG_FORMAT_BGRA16: | |
shuffle64 = 0x0607000102030405; // swap bytes, BGRA to RGBA | |
break; | |
default: | |
shuffle64 = 0; | |
break; | |
} | |
#if defined(UPNG__ARCH_X64_AVX2) | |
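// an AVX2-enabled build already requires SSE4.1 on the target CPU, so the SSE4 path is called directly | |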
return upng__unrow1_sse4(dst, src, last, size, inc, shuffle64); | |
#elif defined(UPNG__ARCH_X64) | |
int cpuid = upng__cpuid(); | |
if (cpuid & UPNG__CPUID_SSE41) | |
{ | |
return upng__unrow1_sse4(dst, src, last, size, inc, shuffle64); | |
} | |
#elif defined(UPNG__ARCH_ARM64) | |
return upng__unrow1_arm64(dst, src, last, size, inc, shuffle64); | |
#else | |
(void)shuffle64; | |
#endif | |
return 0; | |
} | |
// handles RGB8, BGR8, RGB16, BGR16 | |
static size_t upng__unrow3(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, upng_format format) | |
{ | |
const uint8_t(*shuffle)[16]; | |
switch (format) | |
{ | |
case UPNG_FORMAT_RGB8: | |
shuffle = upng__shuffle_RGB8; | |
break; | |
case UPNG_FORMAT_BGR8: | |
shuffle = upng__shuffle_BGR8; | |
break; | |
case UPNG_FORMAT_RGB16: | |
shuffle = upng__shuffle_RGB16; | |
break; | |
case UPNG_FORMAT_BGR16: | |
shuffle = upng__shuffle_BGR16; | |
break; | |
default: | |
shuffle = NULL; | |
break; | |
} | |
#if defined(UPNG__ARCH_X64) | |
int cpuid = upng__cpuid(); | |
if (cpuid & UPNG__CPUID_SSE41) | |
{ | |
return upng__unrow3_sse4(dst, src, last, size, inc, shuffle); | |
} | |
#elif defined(UPNG__ARCH_ARM64) | |
return upng__unrow3_arm64(dst, src, last, size, inc, shuffle); | |
#else | |
(void)shuffle; | |
#endif | |
return 0; | |
} | |
static void upng__unrow(uint8_t* dst, const uint8_t* src, size_t pitch, size_t size, upng_format format, upng_filter filter) | |
{ | |
// the NONE filter is the same as UP when the previous row is all zero values | |
// (dst = src + 0 == src), so both filters share one code path with "last" pointing at a zero row | |
static const uint8_t UPNG__ALIGN(64, zero[64]) = { 0 }; | |
const uint8_t* last = filter == UPNG_FILTER_NONE ? zero : dst - pitch; | |
size_t inc; | |
size_t used; | |
if (format == UPNG_FORMAT_RGB8 || format == UPNG_FORMAT_BGR8 || format == UPNG_FORMAT_RGB16 || format == UPNG_FORMAT_BGR16) | |
{ | |
inc = filter == UPNG_FILTER_NONE ? 0 : 48; | |
used = upng__unrow3(dst, src, last, size, inc, format); | |
} | |
else | |
{ | |
inc = filter == UPNG_FILTER_NONE ? 0 : 16; | |
used = upng__unrow1(dst, src, last, size, inc, format); | |
} | |
last += inc == 0 ? 0 : used; | |
src += used; | |
dst += used; | |
size -= used; | |
// remaining size is less than one vector block (16 or 48 bytes); it is the whole row when no SIMD path is available | |
switch (format) | |
{ | |
case UPNG_FORMAT_G8: // 16 pixels per 16 bytes | |
inc /= 16; // 0 or 1 | |
while (size != 0) | |
{ | |
dst[0] = src[0] + last[0]; | |
last += inc; | |
src += 1; | |
dst += 1; | |
size -= 1; | |
} | |
break; | |
case UPNG_FORMAT_GA8: // 8 pixels per 16 bytes | |
inc /= 8; // 0 or 2 | |
while (size != 0) | |
{ | |
dst[0] = src[0] + last[0]; | |
dst[1] = src[1] + last[1]; | |
last += inc; | |
src += 2; | |
dst += 2; | |
size -= 2; | |
} | |
break; | |
case UPNG_FORMAT_RGB8: // 16 pixels per 48 bytes | |
inc /= 16; // 0 or 3 | |
while (size != 0) | |
{ | |
dst[0] = src[0] + last[0]; | |
dst[1] = src[1] + last[1]; | |
dst[2] = src[2] + last[2]; | |
last += inc; | |
src += 3; | |
dst += 3; | |
size -= 3; | |
} | |
break; | |
case UPNG_FORMAT_BGR8: // 16 pixels per 48 bytes | |
inc /= 16; // 0 or 3 | |
while (size != 0) | |
{ | |
dst[0] = src[2] + last[0]; | |
dst[1] = src[1] + last[1]; | |
dst[2] = src[0] + last[2]; | |
last += inc; | |
src += 3; | |
dst += 3; | |
size -= 3; | |
} | |
break; | |
case UPNG_FORMAT_RGBA8: // 4 pixels per 16 bytes | |
inc /= 4; // 0 or 4 | |
while (size != 0) | |
{ | |
dst[0] = src[0] + last[0]; | |
dst[1] = src[1] + last[1]; | |
dst[2] = src[2] + last[2]; | |
dst[3] = src[3] + last[3]; | |
last += inc; | |
src += 4; | |
dst += 4; | |
size -= 4; | |
} | |
break; | |
case UPNG_FORMAT_BGRA8: // 4 pixels per 16 bytes | |
inc /= 4; // 0 or 4 | |
while (size != 0) | |
{ | |
dst[0] = src[2] + last[0]; | |
dst[1] = src[1] + last[1]; | |
dst[2] = src[0] + last[2]; | |
dst[3] = src[3] + last[3]; | |
last += inc; | |
src += 4; | |
dst += 4; | |
size -= 4; | |
} | |
break; | |
case UPNG_FORMAT_G16: // 8 pixels per 16 bytes | |
inc /= 8; // 0 or 2 | |
while (size != 0) | |
{ | |
dst[0] = src[1] + last[0]; | |
dst[1] = src[0] + last[1]; | |
last += inc; | |
src += 2; | |
dst += 2; | |
size -= 2; | |
} | |
break; | |
case UPNG_FORMAT_GA16: // 4 pixels per 16 bytes | |
inc /= 4; // 0 or 4 | |
while (size != 0) | |
{ | |
dst[0] = src[1] + last[0]; | |
dst[1] = src[0] + last[1]; | |
dst[2] = src[3] + last[2]; | |
dst[3] = src[2] + last[3]; | |
last += inc; | |
src += 4; | |
dst += 4; | |
size -= 4; | |
} | |
break; | |
case UPNG_FORMAT_RGB16: // 8 pixels per 48 bytes | |
inc /= 8; // 0 or 6 | |
while (size != 0) | |
{ | |
dst[0] = src[1] + last[0]; | |
dst[1] = src[0] + last[1]; | |
dst[2] = src[3] + last[2]; | |
dst[3] = src[2] + last[3]; | |
dst[4] = src[5] + last[4]; | |
dst[5] = src[4] + last[5]; | |
last += inc; | |
src += 6; | |
dst += 6; | |
size -= 6; | |
} | |
break; | |
case UPNG_FORMAT_BGR16: // 8 pixels per 48 bytes | |
inc /= 8; // 0 or 6 | |
while (size != 0) | |
{ | |
dst[0] = src[5] + last[0]; | |
dst[1] = src[4] + last[1]; | |
dst[2] = src[3] + last[2]; | |
dst[3] = src[2] + last[3]; | |
dst[4] = src[1] + last[4]; | |
dst[5] = src[0] + last[5]; | |
last += inc; | |
src += 6; | |
dst += 6; | |
size -= 6; | |
} | |
break; | |
case UPNG_FORMAT_RGBA16: // 2 pixels per 16 bytes | |
inc /= 2; // 0 or 8 | |
while (size != 0) | |
{ | |
dst[0] = src[1] + last[0]; | |
dst[1] = src[0] + last[1]; | |
dst[2] = src[3] + last[2]; | |
dst[3] = src[2] + last[3]; | |
dst[4] = src[5] + last[4]; | |
dst[5] = src[4] + last[5]; | |
dst[6] = src[7] + last[6]; | |
dst[7] = src[6] + last[7]; | |
last += inc; | |
src += 8; | |
dst += 8; | |
size -= 8; | |
} | |
break; | |
case UPNG_FORMAT_BGRA16: // 2 pixels per 16 bytes | |
inc /= 2; // 0 or 8 | |
while (size != 0) | |
{ | |
dst[0] = src[5] + last[0]; | |
dst[1] = src[4] + last[1]; | |
dst[2] = src[3] + last[2]; | |
dst[3] = src[2] + last[3]; | |
dst[4] = src[1] + last[4]; | |
dst[5] = src[0] + last[5]; | |
dst[6] = src[7] + last[6]; | |
dst[7] = src[6] + last[7]; | |
last += inc; | |
src += 8; | |
dst += 8; | |
size -= 8; | |
} | |
break; | |
} | |
} | |
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
static const char upng__sig[] = "\x89PNG\r\n\x1a\n"; | |
static const size_t upng__ihdr_size = 13; | |
// max chunk size allowed by the PNG spec (2^31 - 1 bytes) | |
static const size_t upng__max_chunk_size = (1U << 31) - 1; | |
static const uint32_t upng__bpp[] = | |
{ | |
1, // G8 | |
2, // GA8 | |
3, // RGB8 | |
3, // BGR8 | |
4, // RGBA8 | |
4, // BGRA8 | |
2, // G16 | |
4, // GA16 | |
6, // RGB16 | |
6, // BGR16 | |
8, // RGBA16 | |
8, // BGRA16 | |
}; | |
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
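// Example usage, writing: a minimal sketch (not part of the library); "pixels", "width" and | |
// "height" are caller-provided, error handling and freeing are omitted: | |
// | |
//   size_t needed = upng_write(NULL, pixels, width, height, 0, UPNG_FORMAT_BGRA8, UPNG_FILTER_UP); | |
//   void* png = malloc(needed); | |
//   size_t written = upng_write(png, pixels, width, height, 0, UPNG_FORMAT_BGRA8, UPNG_FILTER_UP); | |
//   // "written" equals "needed" on success, 0 for unsupported parameter values | |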
size_t upng_write(void* dst, const void* src, uint32_t width, uint32_t height, size_t pitch, upng_format format, upng_filter filter) | |
{ | |
if (width == 0 || height == 0) | |
{ | |
// bad width or height | |
return 0; | |
} | |
const uint32_t bpp = upng__bpp[format]; | |
if (pitch == 0) | |
{ | |
pitch = width * bpp; | |
} | |
if ((size_t)width * height < (size_t)width || (size_t)width * height >= (size_t)1 << 48) | |
{ | |
// width and height too big | |
return 0; | |
} | |
if ((size_t)pitch * bpp < (size_t)pitch) | |
{ | |
// pitch too large, overflows size_t | |
return 0; | |
} | |
static const char iend_chunk[] = "\0\0\0\0IEND\xae\x42\x60\x82"; | |
// max zlib block size | |
const uint32_t max_block_size = 65535; | |
// how many pixels fit into one zlib block (conservative estimate, because of filter byte) | |
const uint32_t pixels_per_block = (max_block_size - 1) / bpp; | |
// how many full zlib blocks needed per row | |
size_t full_block_count = width / pixels_per_block; | |
// how many pixels are left | |
size_t tail_block_pixels = width % pixels_per_block; | |
// how many bytes in full zlib blocks | |
size_t full_block_size = full_block_count * (1 + 4 + pixels_per_block * bpp); | |
// how many bytes in last zlib block | |
size_t last_block_size = tail_block_pixels ? (1 + 4 + tail_block_pixels * bpp) : 0; | |
// total size per row including filter byte | |
size_t size_per_row = 1 + full_block_size + last_block_size; | |
// how many rows fit into IDAT chunk | |
size_t rows_per_idat = upng__max_chunk_size / size_per_row; | |
if (rows_per_idat == 0) | |
{ | |
// code assumes it can fit at least one full row (including zlib block headers) into IDAT | |
return 0; | |
} | |
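// worked example for width=8192, BGRA8 (bpp=4): pixels_per_block = 65534/4 = 16383, | |
// full_block_count = 8192/16383 = 0, tail_block_pixels = 8192, last_block_size = 1+4+32768 = 32773, | |
// size_per_row = 1 + 0 + 32773 = 32774, rows_per_idat = 0x7fffffff/32774 = 65524 | |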
if (!dst) | |
{ | |
size_t size = 0; | |
// png signature | |
size += sizeof(upng__sig) - 1; | |
// IHDR chunk | |
size += 4 + 4 + upng__ihdr_size + 4; | |
// first IDAT chunk contains 2 zlib header bytes | |
size += 4 + 4 + 2 + 4; | |
// how many full IDAT chunks | |
size_t full_idat_count = height / rows_per_idat; | |
// how many rows in last IDAT | |
size_t tail_idat_rows = height % rows_per_idat; | |
size += (4 + 4 + rows_per_idat * size_per_row + 4) * full_idat_count; | |
size += tail_idat_rows ? (4 + 4 + tail_idat_rows * size_per_row + 4) : 0; | |
// last IDAT chunk with empty zlib block & adler32 | |
size += 4 + 4 + (1 + 4) + (4) + 4; | |
// IEND chunk | |
size += sizeof(iend_chunk) - 1; | |
return size; | |
} | |
upng__crc32_init(); | |
uint8_t* out = (uint8_t*)dst; | |
// file signature, https://www.w3.org/TR/png/#5PNG-file-signature | |
for (size_t i = 0; i < sizeof(upng__sig) - 1; i++) | |
{ | |
*out++ = (uint8_t)upng__sig[i]; | |
} | |
// IHDR, https://www.w3.org/TR/png/#11IHDR | |
{ | |
// https://www.w3.org/TR/png/#6Colour-values | |
static const uint8_t bits[] = { 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16 }; | |
static const uint8_t type[] = { 0, 4, 2, 2, 6, 6, 0, 4, 2, 2, 6, 6 }; | |
*out++ = 0; | |
*out++ = 0; | |
*out++ = 0; | |
*out++ = (uint8_t)upng__ihdr_size; | |
*out++ = (uint8_t)'I'; | |
*out++ = (uint8_t)'H'; | |
*out++ = (uint8_t)'D'; | |
*out++ = (uint8_t)'R'; | |
*out++ = (uint8_t)(width >> 24); | |
*out++ = (uint8_t)(width >> 16); | |
*out++ = (uint8_t)(width >> 8); | |
*out++ = (uint8_t)(width); | |
*out++ = (uint8_t)(height >> 24); | |
*out++ = (uint8_t)(height >> 16); | |
*out++ = (uint8_t)(height >> 8); | |
*out++ = (uint8_t)(height); | |
*out++ = bits[format]; | |
*out++ = type[format]; | |
*out++ = 0; // zlib compression | |
*out++ = 0; // filter method | |
*out++ = 0; // no interlace | |
uint32_t crc = upng__crc32(UPNG__CRC32_INIT, out - upng__ihdr_size - 4, upng__ihdr_size + 4); | |
*out++ = (uint8_t)(crc >> 24); | |
*out++ = (uint8_t)(crc >> 16); | |
*out++ = (uint8_t)(crc >> 8); | |
*out++ = (uint8_t)(crc); | |
} | |
// first IDAT contains just 2 bytes of zlib format | |
{ | |
*out++ = 0; | |
*out++ = 0; | |
*out++ = 0; | |
*out++ = 2; | |
*out++ = (uint8_t)'I'; | |
*out++ = (uint8_t)'D'; | |
*out++ = (uint8_t)'A'; | |
*out++ = (uint8_t)'T'; | |
*out++ = 0x78; // CM=8, CINFO=7 | |
*out++ = 0x01; // FCHECK=1, FDICT=0, FLEVEL=0 | |
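// this fixed zlib header passes the FCHECK test: 0x78 * 256 + 0x01 = 30721 = 31 * 991, | |
// i.e. the 16-bit value is divisible by 31 as required | |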
uint32_t crc = 0xec1a7ed2; // crc32(out - 6, 6) | |
*out++ = (uint8_t)(crc >> 24); | |
*out++ = (uint8_t)(crc >> 16); | |
*out++ = (uint8_t)(crc >> 8); | |
*out++ = (uint8_t)(crc); | |
} | |
upng__idat idat; | |
idat.adler = UPNG__ADLER32_INIT; | |
for (size_t y0 = 0; y0 < height; y0 += rows_per_idat) | |
{ | |
size_t rows_in_idat = (height - y0) < rows_per_idat ? (height - y0) : rows_per_idat; | |
uint32_t idat_size = (uint32_t)(rows_in_idat * size_per_row); | |
// start of IDAT, https://www.w3.org/TR/png/#11IDAT | |
*out++ = (uint8_t)(idat_size >> 24); | |
*out++ = (uint8_t)(idat_size >> 16); | |
*out++ = (uint8_t)(idat_size >> 8); | |
*out++ = (uint8_t)(idat_size); | |
*out++ = (uint8_t)'I'; | |
*out++ = (uint8_t)'D'; | |
*out++ = (uint8_t)'A'; | |
*out++ = (uint8_t)'T'; | |
idat.crc = 0x35af061e; // crc32(out - 4, 4) | |
for (size_t yi = 0; yi < rows_in_idat; yi++) | |
{ | |
size_t y = y0 + yi; | |
// every row will always start on a new zlib block | |
for (size_t x = 0; x < width; x += pixels_per_block) | |
{ | |
// how many pixels to use | |
uint32_t pixel_count = (uint32_t)((width - x) < pixels_per_block ? (width - x) : pixels_per_block); | |
uint32_t pixel_size = pixel_count * bpp; | |
// include filter byte | |
uint32_t block_size = (x == 0 ? 1 : 0) + pixel_size; | |
*out++ = 0; // BFINAL=0, BTYPE=0 | |
*out++ = (uint8_t)(block_size); | |
*out++ = (uint8_t)(block_size >> 8); | |
*out++ = (uint8_t)(~block_size); | |
*out++ = (uint8_t)(~block_size >> 8); | |
idat.crc = upng__crc32(idat.crc, out - 5, 5); | |
// first row uses NONE, rest of them UP/NONE filter | |
upng_filter row_filter = y == 0 ? UPNG_FILTER_NONE : filter; | |
if (x == 0) | |
{ | |
// each row starts with filter byte | |
// https://www.w3.org/TR/png/#9Filter-types | |
*out++ = (uint8_t)row_filter; | |
idat.adler = upng__adler32(idat.adler, out - 1, 1); | |
idat.crc = upng__crc32(idat.crc, out - 1, 1); | |
} | |
const uint8_t* pix = (const uint8_t*)src + y * pitch + x * bpp; | |
upng__row(&idat, out, pix, pitch, pixel_size, format, row_filter); | |
out += pixel_size; | |
} | |
} | |
// end of IDAT | |
*out++ = (uint8_t)(idat.crc >> 24); | |
*out++ = (uint8_t)(idat.crc >> 16); | |
*out++ = (uint8_t)(idat.crc >> 8); | |
*out++ = (uint8_t)(idat.crc); | |
} | |
// one more IDAT with empty zlib block (1+4 bytes) and adler32 checksum (4 bytes) | |
{ | |
uint32_t idat_size = 1 + 4 + 4; | |
*out++ = 0; | |
*out++ = 0; | |
*out++ = 0; | |
*out++ = (uint8_t)idat_size; | |
*out++ = (uint8_t)'I'; | |
*out++ = (uint8_t)'D'; | |
*out++ = (uint8_t)'A'; | |
*out++ = (uint8_t)'T'; | |
*out++ = 1; // BFINAL=1, BTYPE=0 | |
*out++ = 0; // LEN = 0x0000 | |
*out++ = 0; | |
*out++ = 0xff; // NLEN = ~LEN = 0xffff | |
*out++ = 0xff; | |
*out++ = (uint8_t)(idat.adler >> 24); | |
*out++ = (uint8_t)(idat.adler >> 16); | |
*out++ = (uint8_t)(idat.adler >> 8); | |
*out++ = (uint8_t)(idat.adler); | |
uint32_t crc = upng__crc32(UPNG__CRC32_INIT, out - (4 + idat_size), 4 + idat_size); | |
*out++ = (uint8_t)(crc >> 24); | |
*out++ = (uint8_t)(crc >> 16); | |
*out++ = (uint8_t)(crc >> 8); | |
*out++ = (uint8_t)(crc); | |
} | |
// IEND, https://www.w3.org/TR/png/#11IEND | |
for (size_t i = 0; i < sizeof(iend_chunk) - 1; i++) | |
{ | |
*out++ = (uint8_t)iend_chunk[i]; | |
} | |
return out - (uint8_t*)dst; | |
} | |
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | |
typedef struct { | |
const uint8_t* in; | |
size_t size; | |
// how many bytes available in current IDAT chunk | |
uint32_t chunk_size; | |
// how many bytes available in current zlib block | |
uint32_t block_size; | |
// BFINAL bit of last zlib block | |
int bfinal; | |
// temporary buffer used to read data that could cross IDAT chunk boundaries | |
// 2 bytes for zlib format header in the beginning | |
// 4 bytes adler32 checksum at the end | |
// 5 bytes for zlib block header | |
uint8_t temp[5]; | |
uint32_t temp_size; | |
// max 8 bytes of one pixel (RGBA16) that could potentially be split across zlib blocks | |
uint8_t split_block[8]; | |
uint32_t split_size; | |
} upng__data; | |
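// note: the reader deals with two independent layers of framing that are not aligned to | |
// each other: PNG IDAT chunk boundaries and zlib stored-block boundaries. "temp" above | |
// reassembles a small header that straddles IDAT chunks; "split_block" reassembles a | |
// pixel that straddles zlib blocks | |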
#define UPNG__CHUNK_IHDR 0x52444849 | |
#define UPNG__CHUNK_IDAT 0x54414449 | |
#define UPNG__CHUNK_IEND 0x444e4549 | |
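// chunk types are compared as little-endian 32-bit loads, so the ASCII bytes appear | |
// reversed: 0x52444849 is 'I','H','D','R' with 'I' (0x49) in the lowest byte | |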
// parses next chunk from input data, returns 4 bytes of chunk type, sets "chunk_size" in "data" | |
static uint32_t upng__next_chunk(upng__data* data) | |
{ | |
// skip crc32 of previous chunk | |
data->in += 4; | |
data->size -= 4; | |
if (data->size < 4 + 4 + 4) | |
{ | |
// not enough input for chunk size/type/crc | |
return 0; | |
} | |
const uint8_t* in = data->in; | |
data->chunk_size = (in[0] << 24) | (in[1] << 16) | (in[2] << 8) | in[3]; | |
data->in += 4; | |
data->size -= 4; | |
// chunk type | |
data->in += 4; | |
data->size -= 4; | |
if (data->chunk_size > upng__max_chunk_size || data->size < data->chunk_size + 4) | |
{ | |
// bad chunk size, or not enough input provided | |
return 0; | |
} | |
return in[4] | (in[5] << 8) | (in[6] << 16) | (in[7] << 24); | |
} | |
// provides exactly "size" bytes from IDAT chunk payload bytes | |
static const uint8_t* upng__data_expect(upng__data* data, uint32_t size) | |
{ | |
if (size <= data->chunk_size) | |
{ | |
// there are enough bytes in current IDAT chunk | |
return data->in; | |
} | |
// otherwise data is split across multiple IDAT chunks | |
uint32_t temp_size = 0; | |
for (;;) | |
{ | |
// copy available bytes to temp buffer | |
uint32_t avail = size < data->chunk_size ? (uint32_t)size : data->chunk_size; | |
for (size_t i = 0; i < avail; i++) | |
{ | |
data->temp[temp_size++] = *data->in++; | |
} | |
size -= avail; | |
data->size -= avail; | |
data->chunk_size -= avail; | |
if (size == 0) | |
{ | |
break; | |
} | |
// find next non-empty IDAT chunk | |
while (data->chunk_size == 0) | |
{ | |
if (upng__next_chunk(data) != UPNG__CHUNK_IDAT) | |
{ | |
// not enough input | |
return NULL; | |
} | |
} | |
if (temp_size == 0 && size <= data->chunk_size) | |
{ | |
// the requested bytes fully fit into the first chunk (because the current one was empty) | |
return data->in; | |
} | |
} | |
data->temp_size = temp_size; | |
return data->temp; | |
} | |
// consumes "size" bytes from IDAT chunk payload | |
static void upng__data_use(upng__data* data, uint32_t size) | |
{ | |
if (data->temp_size == 0) | |
{ | |
// consume bytes from IDAT chunk payload | |
data->in += size; | |
data->size -= size; | |
data->chunk_size -= size; | |
} | |
else | |
{ | |
// consume bytes from "temp" | |
data->temp_size = 0; | |
} | |
} | |
// provides at least "min_size" bytes from zlib block payload, returns actually usable amount in "size" | |
static const uint8_t* upng__block_expect(upng__data* data, uint32_t min_size, uint32_t* size) | |
{ | |
uint32_t avail_size = data->block_size < data->chunk_size ? data->block_size : data->chunk_size; | |
if (min_size <= avail_size) | |
{ | |
// there are enough bytes | |
*size = avail_size; | |
return data->in; | |
} | |
// otherwise data is split across multiple zlib blocks | |
uint32_t split_size = 0; | |
for (;;) | |
{ | |
// copy available bytes into split_block | |
uint32_t avail = min_size < avail_size ? min_size : avail_size; | |
for (size_t i = 0; i < avail; i++) | |
{ | |
data->split_block[split_size++] = data->in[i]; | |
} | |
upng__data_use(data, avail); | |
data->block_size -= avail; | |
min_size -= avail; | |
avail_size -= avail; | |
if (min_size == 0) | |
{ | |
break; | |
} | |
// in case zlib block spans multiple IDAT chunks, read next IDAT chunk | |
if (data->chunk_size == 0) | |
{ | |
// find next non-empty IDAT chunk | |
while (data->chunk_size == 0) | |
{ | |
if (upng__next_chunk(data) != UPNG__CHUNK_IDAT) | |
{ | |
// not enough input | |
return NULL; | |
} | |
} | |
} | |
// find next non-empty zlib block | |
while (data->block_size == 0) | |
{ | |
if (data->bfinal) | |
{ | |
// after BFINAL=1 there are no more zlib blocks expected | |
return NULL; | |
} | |
const uint8_t* in; | |
if (!(in = upng__data_expect(data, 5))) | |
{ | |
// not enough input | |
return NULL; | |
} | |
uint8_t block_type = in[0]; | |
if ((block_type >> 1) != 0) | |
{ | |
// upng supports only uncompressed zlib blocks (BTYPE=0) | |
return NULL; | |
} | |
uint16_t block_size = in[1] | (in[2] << 8); | |
uint16_t size_check = in[3] | (in[4] << 8); | |
if ((uint16_t)block_size != (uint16_t)~size_check) | |
{ | |
// bad zlib block size (LEN/NLEN) | |
return NULL; | |
} | |
upng__data_use(data, 5); | |
data->bfinal = block_type & 1; | |
data->block_size = block_size; | |
} | |
avail_size = data->block_size < data->chunk_size ? data->block_size : data->chunk_size; | |
if (split_size == 0 && min_size <= avail_size) | |
{ | |
// the requested bytes fully fit into the first block (because the current one was empty) | |
*size = avail_size; | |
return data->in; | |
} | |
} | |
data->split_size = split_size; | |
*size = split_size; | |
return data->split_block; | |
} | |
// consumes "size" bytes from zlib block payload | |
static void upng__block_use(upng__data* data, uint32_t size) | |
{ | |
if (data->split_size == 0) | |
{ | |
// consume bytes from zlib block | |
upng__data_use(data, size); | |
data->block_size -= size; | |
} | |
else | |
{ | |
// consume bytes from "split_block" | |
data->split_size = 0; | |
} | |
} | |
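// Example usage, reading: a minimal sketch (not part of the library); "png_data" and | |
// "png_size" are caller-provided, error handling and freeing are omitted: | |
// | |
//   uint32_t w, h; | |
//   upng_format fmt; | |
//   size_t total = upng_read(NULL, png_data, png_size, &w, &h, &fmt, 0, 0); // header info only | |
//   void* pixels = malloc(total); // total == pitch * height | |
//   size_t done = upng_read(pixels, png_data, png_size, &w, &h, &fmt, 0, 0); // 0 on failure | |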
size_t upng_read(void* dst, const void* src, size_t size, uint32_t* width, uint32_t* height, upng_format* format, size_t pitch, uint32_t flags) | |
{ | |
const uint8_t* in = (const uint8_t*)src; | |
// png signature | |
{ | |
if (size < sizeof(upng__sig) - 1) | |
{ | |
// not enough input bytes for png signature | |
return 0; | |
} | |
for (size_t i = 0; i < sizeof(upng__sig) - 1; i++) | |
{ | |
if (in[i] != (uint8_t)upng__sig[i]) | |
{ | |
// bad png signature | |
return 0; | |
} | |
} | |
in += sizeof(upng__sig) - 1; | |
size -= sizeof(upng__sig) - 1; | |
} | |
uint32_t w, h; | |
upng_format fmt; | |
upng__data data = { 0 }; | |
// back up by 4 bytes, because upng__next_chunk will skip crc32 | |
data.in = in - 4; | |
data.size = size + 4; | |
// IHDR chunk | |
{ | |
if (upng__next_chunk(&data) != UPNG__CHUNK_IHDR) | |
{ | |
// first chunk must be IHDR | |
return 0; | |
} | |
if (data.chunk_size != upng__ihdr_size) | |
{ | |
// bad IHDR size | |
return 0; | |
} | |
const uint8_t* in = data.in; | |
w = (in[0] << 24) | (in[1] << 16) | (in[2] << 8) | in[3]; | |
h = (in[4] << 24) | (in[5] << 16) | (in[6] << 8) | in[7]; | |
uint8_t bits = in[8]; | |
uint8_t type = in[9]; | |
uint8_t compression = in[10]; | |
uint8_t filter = in[11]; | |
uint8_t interlace = in[12]; | |
data.in += upng__ihdr_size; | |
data.size -= upng__ihdr_size; | |
if (w == 0 || h == 0 // invalid width or height values | |
|| (size_t)w * h < (size_t)w // width * height overflows | |
|| (size_t)w * h >(size_t)1 << 48) // width * height takes too much memory | |
{ | |
return 0; | |
} | |
int bgr = !!(flags & UPNG_READ_SWAP_TO_BGR); | |
if (bits == 8) | |
{ | |
switch (type) | |
{ | |
case 0: fmt = UPNG_FORMAT_G8; break; | |
case 2: fmt = bgr ? UPNG_FORMAT_BGR8 : UPNG_FORMAT_RGB8; break; | |
case 4: fmt = UPNG_FORMAT_GA8; break; | |
case 6: fmt = bgr ? UPNG_FORMAT_BGRA8 : UPNG_FORMAT_RGBA8; break; | |
default: return 0; // unsupported 8-bit format | |
} | |
} | |
else if (bits == 16) | |
{ | |
switch (type) | |
{ | |
case 0: fmt = UPNG_FORMAT_G16; break; | |
case 2: fmt = bgr ? UPNG_FORMAT_BGR16 : UPNG_FORMAT_RGB16; break; | |
case 4: fmt = UPNG_FORMAT_GA16; break; | |
case 6: fmt = bgr ? UPNG_FORMAT_BGRA16 : UPNG_FORMAT_RGBA16; break; | |
default: return 0; // unsupported 16-bit format | |
} | |
} | |
else | |
{ | |
// unsupported bit count | |
return 0; | |
} | |
if (compression != 0) | |
{ | |
// unsupported compression method | |
return 0; | |
} | |
if (filter != 0) | |
{ | |
// unsupported filter method | |
return 0; | |
} | |
if (interlace != 0) | |
{ | |
// unsupported interlace method | |
return 0; | |
} | |
*width = w; | |
*height = h; | |
*format = fmt; | |
} | |
const uint32_t bpp = upng__bpp[fmt]; | |
if (pitch == 0) | |
{ | |
if ((size_t)w * bpp < (size_t)bpp) | |
{ | |
// width too large, overflows size_t | |
return 0; | |
} | |
pitch = (size_t)w * bpp; | |
} | |
if (h * pitch < pitch) | |
{ | |
// pitch too large, overflows size_t | |
return 0; | |
} | |
if (dst == NULL) | |
{ | |
// done! only IHDR info requested | |
return h * pitch; | |
} | |
// skip chunks until first IDAT chunk | |
for (;;) | |
{ | |
uint32_t type = upng__next_chunk(&data); | |
if (type == UPNG__CHUNK_IDAT) | |
{ | |
break; | |
} | |
else if ((char)type < 'a' || (char)type > 'z') | |
{ | |
// only "non-critical" chunks allowed | |
return 0; | |
} | |
// ignore optional chunk payload | |
data.in += data.chunk_size; | |
data.size -= data.chunk_size; | |
} | |
// IDAT chunk payload starts with 2 bytes for zlib format | |
{ | |
if (!(in = upng__data_expect(&data, 2))) | |
{ | |
// not enough input | |
return 0; | |
} | |
uint32_t cmf = in[0]; // CM & CINFO | |
uint32_t flg = in[1]; // FCHECK, FDICT, FLEVEL | |
if ((cmf & 0xf) != 0x8) | |
{ | |
// CM must be 8 = deflate compression method | |
return 0; | |
} | |
if (flg & (1 << 5)) | |
{ | |
// FDICT must be 0 = no dictionary | |
return 0; | |
} | |
if ((cmf * 256 + flg) % 31 != 0) | |
{ | |
// bad FCHECK value | |
return 0; | |
} | |
upng__data_use(&data, 2); | |
} | |
for (size_t y = 0; y < h; y++) | |
{ | |
uint32_t in_avail; | |
if (!(in = upng__block_expect(&data, 1, &in_avail))) | |
{ | |
// not enough input for row filter byte | |
return 0; | |
} | |
uint8_t row_filter = in[0]; | |
if (row_filter != UPNG_FILTER_NONE && row_filter != UPNG_FILTER_UP) | |
{ | |
// upng supports only NONE and UP filters | |
return 0; | |
} | |
upng__block_use(&data, 1); | |
if (row_filter == UPNG_FILTER_UP && y == 0) | |
{ | |
// if first row uses UP filter, force it to use NONE | |
row_filter = UPNG_FILTER_NONE; | |
} | |
uint32_t x = 0; | |
while (x < w) | |
{ | |
if (!(in = upng__block_expect(&data, bpp, &in_avail))) | |
{ | |
// not enough input for at least one more pixel | |
return 0; | |
} | |
uint32_t pixel_count = in_avail / bpp; | |
pixel_count = w - x < pixel_count ? w - x : pixel_count; | |
uint32_t pixel_size = pixel_count * bpp; | |
uint8_t* pix_out = (uint8_t*)dst + y * pitch + x * bpp; | |
upng__unrow(pix_out, in, pitch, pixel_size, fmt, (upng_filter)row_filter); | |
x += pixel_count; | |
upng__block_use(&data, pixel_size); | |
} | |
} | |
if (data.block_size != 0) | |
{ | |
// no more bytes expected in zlib payload | |
return 0; | |
} | |
// expect empty zlib blocks until BFINAL=1 | |
while (data.bfinal == 0) | |
{ | |
if (!(in = upng__data_expect(&data, 5))) | |
{ | |
// not enough input | |
return 0; | |
} | |
uint8_t block_type = in[0]; | |
if ((block_type >> 1) != 0) | |
{ | |
// upng supports only uncompressed zlib blocks (BTYPE=0) | |
return 0; | |
} | |
uint16_t block_size = in[1] | (in[2] << 8); | |
uint16_t size_check = in[3] | (in[4] << 8); | |
if (block_size != 0 || size_check != 0xffff) | |
{ | |
// expected 0-sized zlib block | |
return 0; | |
} | |
upng__data_use(&data, 5); | |
data.bfinal = block_type & 1; | |
} | |
// skip adler32 checksum | |
if (!(in = upng__data_expect(&data, 4))) | |
{ | |
return 0; | |
} | |
upng__data_use(&data, 4); | |
if (data.chunk_size != 0) | |
{ | |
// no more bytes expected in IDAT chunks | |
return 0; | |
} | |
// skip until IEND chunk | |
for (;;) | |
{ | |
uint32_t type = upng__next_chunk(&data); | |
if (type == UPNG__CHUNK_IEND) | |
{ | |
break; | |
} | |
else if ((char)type < 'a' || (char)type > 'z') | |
{ | |
// only "non-critical" chunks allowed | |
return 0; | |
} | |
// ignore optional chunk payload | |
data.in += data.chunk_size; | |
data.size -= data.chunk_size; | |
} | |
if (data.chunk_size != 0) | |
{ | |
// IEND chunk size must be 0 | |
return 0; | |
} | |
// ignore crc32 of IEND chunk | |
if (data.size != 4) | |
{ | |
// unexpected length of input | |
return 0; | |
} | |
// OK! | |
return h * pitch; | |
} |