From 387fd25f57f41009fc317f7922e957de9f370ff2 Mon Sep 17 00:00:00 2001
From: Ilya Kurdyukov <[email protected]>
Date: Tue, 14 Dec 2021 21:54:32 +0700
Subject: [PATCH] faster lzma_decoder for x86

Notice: Uses inline assembly with CMOV instruction.

Another change that removes the comparison with in_size can give a few
percent speedup for architectures with a small number of registers.
---
 src/liblzma/lzma/lzma_decoder.c | 78 +++++++++++++-------------
 src/liblzma/rangecoder/range_decoder.h | 78 ++++++++++++++++++++++++--
 2 files changed, 113 insertions(+), 43 deletions(-)

diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c
index e605a0a..ecb804b 100644
--- a/src/liblzma/lzma/lzma_decoder.c
+++ b/src/liblzma/lzma/lzma_decoder.c
@@ -415,9 +415,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
= offset + match_bit
+ symbol;
- rc_bit(probs[subcoder_index],
- offset &= ~match_bit,
- offset &= match_bit,
+ rc_bit_matched(probs[subcoder_index],
SEQ_LITERAL_MATCHED);
// It seems to be faster to do this
@@ -437,10 +435,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
case seq: \
match_bit = len & offset; \
subcoder_index = offset + match_bit + symbol; \
- rc_bit(probs[subcoder_index], \
- offset &= ~match_bit, \
- offset &= match_bit, \
- seq)
+ rc_bit_matched(probs[subcoder_index], seq)
d(SEQ_LITERAL_MATCHED0);
len <<= 1;
@@ -564,40 +559,43 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
probs = coder->pos_special + rep0
- symbol - 1;
symbol = 1;
- offset = 0;
- case SEQ_DIST_MODEL:
+ offset = 1;
#ifdef HAVE_SMALL
+ limit = 1 << limit;
+ case SEQ_DIST_MODEL:
do {
- rc_bit(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- } while (++offset < limit);
+ offset <<= 1;
+ } while (offset < limit);
#else
+ case SEQ_DIST_MODEL:
switch (limit) {
case 5:
assert(offset == 0);
- rc_bit(probs[symbol], ,
- rep0 += 1U,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- ++offset;
+ offset <<= 1;
--limit;
case 4:
- rc_bit(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- ++offset;
+ offset <<= 1;
--limit;
case 3:
- rc_bit(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- ++offset;
+ offset <<= 1;
--limit;
case 2:
- rc_bit(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- ++offset;
+ offset <<= 1;
--limit;
case 1:
// We need "symbol" only for
@@ -606,8 +604,8 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
// rc_bit_last() here to omit
// the unneeded updating of
// "symbol".
- rc_bit_last(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist_last(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
}
#endif
@@ -630,30 +628,32 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
rep0 <<= ALIGN_BITS;
symbol = 1;
#ifdef HAVE_SMALL
- offset = 0;
+ offset = 1;
case SEQ_ALIGN:
do {
- rc_bit(coder->pos_align[
- symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(coder->pos_align[
+ symbol],
+ offset,
SEQ_ALIGN);
- } while (++offset < ALIGN_BITS);
+ offset <<= 1;
+ } while (offset < (1 << ALIGN_BITS));
#else
case SEQ_ALIGN0:
- rc_bit(coder->pos_align[symbol], ,
- rep0 += 1, SEQ_ALIGN0);
+ rc_bit_dist(coder->pos_align[symbol],
+ 1, SEQ_ALIGN0);
case SEQ_ALIGN1:
- rc_bit(coder->pos_align[symbol], ,
- rep0 += 2, SEQ_ALIGN1);
+ rc_bit_dist(coder->pos_align[symbol],
+ 2, SEQ_ALIGN1);
case SEQ_ALIGN2:
- rc_bit(coder->pos_align[symbol], ,
- rep0 += 4, SEQ_ALIGN2);
+ rc_bit_dist(coder->pos_align[symbol],
+ 4, SEQ_ALIGN2);
case SEQ_ALIGN3:
// Like in SEQ_DIST_MODEL, we don't
// need "symbol" for anything else
// than indexing the probability array.
- rc_bit_last(coder->pos_align[symbol], ,
- rep0 += 8, SEQ_ALIGN3);
+ rc_bit_dist_last(coder->pos_align[
+ symbol],
+ 8, SEQ_ALIGN3);
#endif
if (rep0 == UINT32_MAX) {
diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h
index e0b051f..cc9ac35 100644
--- a/src/liblzma/rangecoder/range_decoder.h
+++ b/src/liblzma/rangecoder/range_decoder.h
@@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
/// variables `in' and `in_size' to be defined.
#define rc_to_local(range_decoder, in_pos) \
lzma_range_decoder rc = range_decoder; \
- size_t rc_in_pos = (in_pos); \
+ size_t rc_in_pos = in_pos - in_size; \
+ const uint8_t *restrict rc_end = in + in_size; \
uint32_t rc_bound
@@ -61,7 +62,7 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
#define rc_from_local(range_decoder, in_pos) \
do { \
range_decoder = rc; \
- in_pos = rc_in_pos; \
+ in_pos = rc_in_pos + in_size; \
} while (0)
@@ -87,12 +88,13 @@ do { \
#define rc_normalize(seq) \
do { \
if (rc.range < RC_TOP_VALUE) { \
- if (unlikely(rc_in_pos == in_size)) { \
+ if (unlikely(!rc_in_pos)) { \
coder->sequence = seq; \
goto out; \
} \
+ rc.code <<= RC_SHIFT_BITS; \
rc.range <<= RC_SHIFT_BITS; \
- rc.code = (rc.code << RC_SHIFT_BITS) | in[rc_in_pos++]; \
+ rc.code |= rc_end[rc_in_pos++]; \
} \
} while (0)
@@ -151,11 +153,79 @@ do { \
/// Decodes one bit, updates "symbol", and runs action0 or action1 depending
/// on the decoded bit.
+#ifndef NO_BRANCH_OPT
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \
+do { \
+ rc_normalize(seq); \
+ uint32_t cache = (prob), tmp; \
+ rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \
+ rc.range -= rc_bound; \
+ /* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \
+ __asm__ ( \
+ "cmpl %3, %2\n\t" \
+ "cmovbl %3, %1\n\t" \
+ "sbbl %0, %0" \
+ : "=&r"(tmp), "+&r"(rc.range) \
+ : "r"(rc.code), "r"(rc_bound) \
+ : "flags" \
+ ); \
+ cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \
+ prob -= cache >> RC_MOVE_BITS; \
+ pre0; tmp = ~tmp; pre1; \
+ rc.code -= tmp & rc_bound; \
+ if (!tmp) { action0; } else { action1; }; \
+} while (0)
+#elif defined(__GNUC__) && defined(__aarch64__)
+#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \
+do { \
+ rc_normalize(seq); \
+ uint32_t cache = (prob), tmp; \
+ rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \
+ rc.range -= rc_bound; \
+ /* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \
+ __asm__ ( \
+ "cmp %w2, %w3\n\t" \
+ "csel %w1, %w3, %w1, lo\n\t" \
+ "csetm %w0, lo" \
+ : "=&r"(tmp), "+&r"(rc.range) \
+ : "r"(rc.code), "r"(rc_bound) \
+ : "cc" \
+ ); \
+ cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \
+ prob -= cache >> RC_MOVE_BITS; \
+ pre0; tmp = ~tmp; pre1; \
+ rc.code -= tmp & rc_bound; \
+ if (!tmp) { action0; } else { action1; }; \
+} while (0)
+#endif
+#endif
+#ifdef rc_bit_nobranch
+#define rc_bit(prob, action0, action1, seq) \
+ rc_bit_nobranch(prob, , symbol = (symbol << 1) - tmp, \
+ action0, action1, seq)
+#define rc_bit_matched(prob, seq) \
+ rc_bit_nobranch(prob, offset &= match_bit ^ tmp, \
+ symbol = (symbol << 1) - tmp, , , seq)
+#define rc_bit_dist(prob, offset, seq) \
+ rc_bit_nobranch(prob, , \
+ symbol = (symbol << 1) - tmp; rep0 += offset & tmp, \
+ , , seq);
+#define rc_bit_dist_last(prob, offset, seq) \
+ rc_bit_nobranch(prob, , rep0 += offset & tmp, , , seq);
+#else
#define rc_bit(prob, action0, action1, seq) \
rc_bit_last(prob, \
symbol <<= 1; action0, \
symbol = (symbol << 1) + 1; action1, \
seq);
+#define rc_bit_matched(prob, seq) \
+ rc_bit(prob, offset &= ~match_bit, offset &= match_bit, seq)
+#define rc_bit_dist(prob, offset, seq) \
+ rc_bit(prob, , rep0 += offset, seq);
+#define rc_bit_dist_last(prob, offset, seq) \
+ rc_bit_last(prob, , rep0 += offset, seq)
+#endif
/// Like rc_bit() but add "case seq:" as a prefix. This makes the unrolled
--
2.17.1 |
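Since the key idea is easy to lose among the diff hunks, here is a portable C sketch of the branchless bit decode that the inline assembly implements, combined with the end-relative input indexing from the rc_to_local/rc_normalize hunks. The RC_* constants and the uint16_t probability type match liblzma, but the function itself is only an illustration: the patch works on macro-local state, and it relies on size_t wraparound where this sketch uses ptrdiff_t.

#include <stddef.h>
#include <stdint.h>

/* Constants as in liblzma's range_common.h. */
#define RC_SHIFT_BITS 8
#define RC_TOP_VALUE (UINT32_C(1) << 24)
#define RC_BIT_MODEL_TOTAL_BITS 11
#define RC_BIT_MODEL_TOTAL (UINT32_C(1) << RC_BIT_MODEL_TOTAL_BITS)
#define RC_MOVE_BITS 5

typedef struct {
	uint32_t range;
	uint32_t code;
} lzma_range_decoder;

/* Decodes one bit with no data-dependent branch. The two ternaries
 * below are what the patch forces into CMOV+SBB (x86) or CSEL+CSETM
 * (AArch64). "end" points one past the input buffer and "pos" is a
 * negative offset from it, so the end-of-input test becomes
 * "*pos == 0" and in_size no longer needs a register in the hot loop. */
static uint32_t
rc_decode_bit(lzma_range_decoder *rc, uint16_t *prob,
		const uint8_t *end, ptrdiff_t *pos)
{
	if (rc->range < RC_TOP_VALUE) {
		/* The real decoder saves its state and returns when
		 * *pos reaches 0; this sketch assumes input remains. */
		rc->code = (rc->code << RC_SHIFT_BITS) | end[(*pos)++];
		rc->range <<= RC_SHIFT_BITS;
	}

	uint32_t cache = *prob;
	uint32_t bound = (rc->range >> RC_BIT_MODEL_TOTAL_BITS) * cache;
	rc->range -= bound;

	/* mask is all ones if the decoded bit is 0, all zeros if it is 1. */
	uint32_t mask = rc->code < bound ? UINT32_MAX : 0;
	rc->range = rc->code < bound ? bound : rc->range;

	/* Branchless probability update. With mask == 0 this is the usual
	 * prob -= prob >> 5; with mask == ~0, adding 31 - 2048 before the
	 * shift makes the wrapped subtraction equal the usual
	 * prob += (2048 - prob) >> 5. */
	cache += mask & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL);
	*prob -= cache >> RC_MOVE_BITS;

	rc->code -= ~mask & bound;	/* subtract bound only for bit 1 */
	return ~mask & 1;		/* the decoded bit */
}

Compilers sometimes lower such ternaries to conditional moves on their own, but not reliably inside the big decoder switch, which is why the patch pins the instruction selection down with inline asm.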
On Intel i5-2500 (Sandy Bridge):
Before:
Performance counter stats for 'src/xzdec/xzdec linux-5.15.tar.xz' (5 runs):
8,954.71 msec task-clock:u # 0.998 CPUs utilized ( +- 0.04% )
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
1,368 page-faults:u # 152.724 /sec ( +- 0.26% )
29,347,257,196 cycles:u # 3.277 GHz ( +- 0.01% )
12,337,648,629 stalled-cycles-frontend:u # 42.04% frontend cycles idle ( +- 0.05% )
9,029,544,206 stalled-cycles-backend:u # 30.77% backend cycles idle ( +- 0.05% )
33,590,809,450 instructions:u # 1.14 insn per cycle
# 0.37 stalled cycles per insn ( +- 0.00% )
4,390,208,316 branches:u # 490.268 M/sec ( +- 0.00% )
477,470,512 branch-misses:u # 10.88% of all branches ( +- 0.02% )
8.97213 +- 0.00262 seconds time elapsed ( +- 0.03% )
After:
Performance counter stats for 'src/xzdec/xzdec linux-5.15.tar.xz' (5 runs):
8,519.22 msec task-clock:u # 0.994 CPUs utilized ( +- 0.44% )
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
1,358 page-faults:u # 159.381 /sec ( +- 0.11% )
28,032,300,841 cycles:u # 3.290 GHz ( +- 0.04% )
13,667,678,436 stalled-cycles-frontend:u # 48.76% frontend cycles idle ( +- 0.08% )
8,181,854,786 stalled-cycles-backend:u # 29.19% backend cycles idle ( +- 0.10% )
37,315,478,332 instructions:u # 1.33 insn per cycle
# 0.37 stalled cycles per insn ( +- 0.00% )
3,502,520,091 branches:u # 411.132 M/sec ( +- 0.00% )
224,562,347 branch-misses:u # 6.41% of all branches ( +- 0.08% )
8.57411 +- 0.00504 seconds time elapsed ( +- 0.06% )
It looks like the results I got on a home computer with a Skylake processor are the best.
Intel Atom: 2-3% speedup
Sandy Bridge: 5% speedup
Zen 2: 8% speedup
At least there are no negative results yet.
But these numbers are for decompressing text data; there is usually more speedup for binary data.
I got closer to 9% on Ivy Bridge (the successor of the Sandy Bridge uarch) on the linux-5.15.tar.xz archive; unfortunately, I don't have perf on that machine at the moment.
On Atom, I guess the misprediction penalty is a bit smaller, and the significant increase in instruction count eats into some of the benefit of the reduced mispredictions.
I don't mind running additional tests on binary data; we just need to select some common archive to benchmark on (something that anyone can retrieve). How about the 175 MiB linux-firmware archive at https://mirrors.edge.kernel.org/pub/linux/kernel/firmware/linux-firmware-20211027.tar.xz ?
@amonakov, yes, it is a good archive for testing binary-data decompression performance.
export LD_LIBRARY_PATH=build/src/liblzma/.libs
perf stat -r 5 build/src/xz/.libs/xz -c -d linux-firmware-20211027.tar.xz > /dev/null
So, on Skylake before:
8 173,04 msec task-clock:u # 1,000 CPUs utilized ( +- 0,03% )
0 context-switches:u # 0,000 K/sec
0 cpu-migrations:u # 0,000 K/sec
16 467 page-faults:u # 0,002 M/sec ( +- 0,00% )
30 598 206 585 cycles:u # 3,744 GHz ( +- 0,02% )
34 721 982 623 instructions:u # 1,13 insn per cycle ( +- 0,00% )
4 194 995 085 branches:u # 513,272 M/sec ( +- 0,00% )
556 648 934 branch-misses:u # 13,27% of all branches ( +- 0,00% )
8,17348 +- 0,00210 seconds time elapsed ( +- 0,03% )
And after:
6 347,72 msec task-clock:u # 1,000 CPUs utilized ( +- 0,02% )
0 context-switches:u # 0,000 K/sec
0 cpu-migrations:u # 0,000 K/sec
16 468 page-faults:u # 0,003 M/sec ( +- 0,00% )
23 685 264 720 cycles:u # 3,731 GHz ( +- 0,02% )
39 467 984 353 instructions:u # 1,67 insn per cycle ( +- 0,00% )
2 941 144 012 branches:u # 463,338 M/sec ( +- 0,00% )
165 593 141 branch-misses:u # 5,63% of all branches ( +- 0,09% )
6,348644 +- 0,000952 seconds time elapsed ( +- 0,01% )
A speedup of almost 29%.
Intel Celeron 1007U (Ivy Bridge)
linux-5.15.7.tar.xz : 19.33 --> 18.07 (+7%)
linux-firmware-20211027.tar.xz : 20.57 --> 17.76 (+16%)
Ryzen 4500U (Zen 2)
Before:
Performance counter stats for 'src/xzdec/xzdec ../linux-firmware-20211027.tar.xz' (3 runs):
11,907.43 msec task-clock:u # 1.000 CPUs utilized ( +- 0.03% )
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
17,240 page-faults:u # 1.448 K/sec ( +- 0.02% )
27,992,949,586 cycles:u # 2.351 GHz ( +- 0.04% )
583,797,051 stalled-cycles-frontend:u # 2.09% frontend cycles idle ( +- 0.06% )
3,412,760,275 stalled-cycles-backend:u # 12.19% backend cycles idle ( +- 0.36% )
35,183,073,831 instructions:u # 1.26 insn per cycle
# 0.10 stalled cycles per insn ( +- 0.00% )
4,182,899,643 branches:u # 351.285 M/sec ( +- 0.00% )
531,756,997 branch-misses:u # 12.71% of all branches ( +- 0.01% )
11.90746 +- 0.00389 seconds time elapsed ( +- 0.03% )
After:
Performance counter stats for 'src/xzdec/xzdec ../linux-firmware-20211027.tar.xz' (3 runs):
9,797.98 msec task-clock:u # 1.000 CPUs utilized ( +- 0.11% )
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
17,247 page-faults:u # 1.760 K/sec ( +- 0.02% )
23,012,154,561 cycles:u # 2.349 GHz ( +- 0.02% )
173,975,497 stalled-cycles-frontend:u # 0.76% frontend cycles idle ( +- 0.09% )
12,818,655,196 stalled-cycles-backend:u # 55.70% backend cycles idle ( +- 0.02% )
39,198,156,956 instructions:u # 1.70 insn per cycle
# 0.33 stalled cycles per insn ( +- 0.00% )
2,970,714,992 branches:u # 303.197 M/sec ( +- 0.00% )
156,064,253 branch-misses:u # 5.25% of all branches ( +- 0.01% )
9.7976 +- 0.0107 seconds time elapsed ( +- 0.11% )
Did a minor update to cover the rc_bit() macro, which is used when XZ is configured with the --enable-small option.
... and I did the same for AArch64, using the CSEL instruction. (patch updated)
But I only have Allwinner H616 (Cortex-A53) for testing:
linux-5.15.7.tar.xz : 25.12 --> 23.85 (+5%)
linux-firmware-20211027.tar.xz : 22.80 --> 21.10 (+8%)
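For anyone reading the AArch64 hunk, the three-instruction sequence maps to the same conditional-select idiom as the x86 CMOV/SBB pair. An illustrative rendering of the asm's semantics (not code from the patch):

/* cmp   %w2, %w3          -> flags from (rc.code - rc_bound)
 * csel  %w1, %w3, %w1, lo -> rc.range = (rc.code < rc_bound) ? rc_bound : rc.range
 * csetm %w0, lo           -> tmp = (rc.code < rc_bound) ? UINT32_MAX : 0
 *
 * CSETM materializes the all-ones mask directly, so no SBB-style
 * subtract-with-borrow trick is needed on AArch64.
 */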
AArch64, AWS Graviton2 (c6g.xlarge size):
Before:
Performance counter stats for './src/xzdec/xzdec ./linux-firmware-20211027.tar.xz' (5 runs):
8751.09 msec task-clock # 1.000 CPUs utilized ( +- 0.02% )
44 context-switches # 0.005 K/sec ( +- 5.47% )
0 cpu-migrations # 0.000 K/sec ( +- 61.24% )
18112 page-faults # 0.002 M/sec ( +- 0.05% )
21832211556 cycles # 2.495 GHz ( +- 0.02% )
30273778750 instructions # 1.39 insn per cycle ( +- 0.01% )
<not supported> branches
535339600 branch-misses ( +- 0.01% )
8.75277 +- 0.00198 seconds time elapsed ( +- 0.02% )
After:
Performance counter stats for './src/xzdec/xzdec ./linux-firmware-20211027.tar.xz' (5 runs):
8111.05 msec task-clock # 1.000 CPUs utilized ( +- 0.01% )
48 context-switches # 0.006 K/sec ( +- 2.54% )
1 cpu-migrations # 0.000 K/sec ( +- 31.62% )
18088 page-faults # 0.002 M/sec ( +- 0.01% )
20232553634 cycles # 2.494 GHz ( +- 0.01% )
34157801215 instructions # 1.69 insn per cycle ( +- 0.00% )
<not supported> branches
159725908 branch-misses ( +- 0.02% )
8.112706 +- 0.000784 seconds time elapsed ( +- 0.01% )
~7.9% speedup.
Results from ALT Linux team:
Comet Lake:
linux-5.15.8.tar.xz : 5.81 --> 5.30 (+9.6%)
linux-firmware-20211027.tar.xz : 6.14 --> 5.31 (+15.6%)
Intel Xeon (Ivy Bridge EP):
linux-5.15.8.tar.xz : 7.987+-0.363 --> 7.128+-0.288 ( 10.75+-5.80 % )
linux-firmware-20211027.tar.xz : 8.552+-0.354 --> 7.059+-0.310 ( 17.46+-5.51 % )
I wrote scripts to make testing easier:
# downloads test files and builds XZ
./build.sh
# tests decompression speed
./bench.sh ref
./bench.sh new
build.sh
#!/bin/bash
set -e
# patch
[ -f faster_lzma_decoder_x86.patch ] || wget https://gist.githubusercontent.com/ilyakurdyukov/f514418f3affd677e1ac408ec0c09188/raw/faster_lzma_decoder_x86.patch
# test data
[ -f linux-5.15.7.tar.xz ] || wget https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.15.7.tar.xz
[ -f linux-firmware-20211027.tar.xz ] || wget https://mirrors.edge.kernel.org/pub/linux/kernel/firmware/linux-firmware-20211027.tar.xz
[ -d xz ] || git clone http://git.tukaani.org/xz.git
mkdir -p ref
mkdir -p new
# build ref
cp -r xz build
cd build
# or cmake as an option
./autogen.sh --no-po4a
./configure
make
cp -L src/liblzma/.libs/liblzma.so.5 src/xz/.libs/xz ../ref/
# build new
patch -p1 < ../faster_lzma_decoder_x86.patch
make
cp -L src/liblzma/.libs/liblzma.so.5 src/xz/.libs/xz ../new/
cd ..
bench.sh
#!/bin/bash
dir=${1:-"ref"}
export LD_LIBRARY_PATH=$(pwd)/$dir
# to make sure the correct liblzma is loaded
ldd $dir/xz | grep liblzma
# or "perf stat -r 5" instead of "for" and "time -p"
for i in 1 2 3; do
echo "# ($dir:$i) linux-5.15.7"
time -p $dir/xz -c -d linux-5.15.7.tar.xz > /dev/null
echo "# ($dir:$i) linux-firmware-20211027"
time -p $dir/xz -c -d linux-firmware-20211027.tar.xz > /dev/null
done
Kunpeng-920 (AArch64)
linux-5.15.7.tar.xz : 8.82 --> 8.78 (+0.4%)
linux-firmware-20211027.tar.xz : 8.97 --> 8.44 (+6.2%)
I got a report that this patch slows down decoding on the Apple M1, but I am unable to investigate that myself.
Well, at least it will be easy to exclude the Apple M1 by adding some #ifndef.
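For example, the guard in range_decoder.h could be tightened like this (a sketch only; the __APPLE__ test is my assumption and is not part of the patch, and building with -DNO_BRANCH_OPT already disables the asm everywhere):

/* Keep the existing NO_BRANCH_OPT escape hatch, but also skip the
 * AArch64 asm on Apple silicon. The __APPLE__ condition is
 * hypothetical, not from the patch. */
#ifndef NO_BRANCH_OPT
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
/* ... x86 rc_bit_nobranch as in the patch ... */
#elif defined(__GNUC__) && defined(__aarch64__) && !defined(__APPLE__)
/* ... AArch64 rc_bit_nobranch as in the patch ... */
#endif
#endif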
New 4500U (Zen 2) stats: