Faster LZMA decoder for x86 CPUs (patch for XZ Utils).
From 387fd25f57f41009fc317f7922e957de9f370ff2 Mon Sep 17 00:00:00 2001
From: Ilya Kurdyukov <[email protected]>
Date: Tue, 14 Dec 2021 21:54:32 +0700
Subject: [PATCH] faster lzma_decoder for x86

Notice: Uses inline assembly with CMOV instruction.

Another change that removes the comparison with in_size can give a few
percent speedup for architectures with a small number of registers.
---
 src/liblzma/lzma/lzma_decoder.c        | 78 +++++++++++++-------
 src/liblzma/rangecoder/range_decoder.h | 78 ++++++++++++++++++++++++--
 2 files changed, 113 insertions(+), 43 deletions(-)

diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c
index e605a0a..ecb804b 100644
--- a/src/liblzma/lzma/lzma_decoder.c
+++ b/src/liblzma/lzma/lzma_decoder.c
@@ -415,9 +415,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
= offset + match_bit
+ symbol;
- rc_bit(probs[subcoder_index],
- offset &= ~match_bit,
- offset &= match_bit,
+ rc_bit_matched(probs[subcoder_index],
SEQ_LITERAL_MATCHED);
// It seems to be faster to do this
@@ -437,10 +435,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
case seq: \
match_bit = len & offset; \
subcoder_index = offset + match_bit + symbol; \
- rc_bit(probs[subcoder_index], \
- offset &= ~match_bit, \
- offset &= match_bit, \
- seq)
+ rc_bit_matched(probs[subcoder_index], seq)
d(SEQ_LITERAL_MATCHED0);
len <<= 1;
@@ -564,40 +559,43 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
probs = coder->pos_special + rep0
- symbol - 1;
symbol = 1;
- offset = 0;
- case SEQ_DIST_MODEL:
+ offset = 1;
#ifdef HAVE_SMALL
+ limit = 1 << limit;
+ case SEQ_DIST_MODEL:
do {
- rc_bit(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- } while (++offset < limit);
+ offset <<= 1;
+ } while (offset < limit);
#else
+ case SEQ_DIST_MODEL:
switch (limit) {
case 5:
assert(offset == 0);
- rc_bit(probs[symbol], ,
- rep0 += 1U,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- ++offset;
+ offset <<= 1;
--limit;
case 4:
- rc_bit(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- ++offset;
+ offset <<= 1;
--limit;
case 3:
- rc_bit(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- ++offset;
+ offset <<= 1;
--limit;
case 2:
- rc_bit(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- ++offset;
+ offset <<= 1;
--limit;
case 1:
// We need "symbol" only for
@@ -606,8 +604,8 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
// rc_bit_last() here to omit
// the unneeded updating of
// "symbol".
- rc_bit_last(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist_last(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
}
#endif
@@ -630,30 +628,32 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
rep0 <<= ALIGN_BITS;
symbol = 1;
#ifdef HAVE_SMALL
- offset = 0;
+ offset = 1;
case SEQ_ALIGN:
do {
- rc_bit(coder->pos_align[
- symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(coder->pos_align[
+ symbol],
+ offset,
SEQ_ALIGN);
- } while (++offset < ALIGN_BITS);
+ offset <<= 1;
+ } while (offset < (1 << ALIGN_BITS));
#else
case SEQ_ALIGN0:
- rc_bit(coder->pos_align[symbol], ,
- rep0 += 1, SEQ_ALIGN0);
+ rc_bit_dist(coder->pos_align[symbol],
+ 1, SEQ_ALIGN0);
case SEQ_ALIGN1:
- rc_bit(coder->pos_align[symbol], ,
- rep0 += 2, SEQ_ALIGN1);
+ rc_bit_dist(coder->pos_align[symbol],
+ 2, SEQ_ALIGN1);
case SEQ_ALIGN2:
- rc_bit(coder->pos_align[symbol], ,
- rep0 += 4, SEQ_ALIGN2);
+ rc_bit_dist(coder->pos_align[symbol],
+ 4, SEQ_ALIGN2);
case SEQ_ALIGN3:
// Like in SEQ_DIST_MODEL, we don't
// need "symbol" for anything else
// than indexing the probability array.
- rc_bit_last(coder->pos_align[symbol], ,
- rep0 += 8, SEQ_ALIGN3);
+ rc_bit_dist_last(coder->pos_align[
+ symbol],
+ 8, SEQ_ALIGN3);
#endif
if (rep0 == UINT32_MAX) {
diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h
index e0b051f..cc9ac35 100644
--- a/src/liblzma/rangecoder/range_decoder.h
+++ b/src/liblzma/rangecoder/range_decoder.h
@@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
/// variables `in' and `in_size' to be defined.
#define rc_to_local(range_decoder, in_pos) \
lzma_range_decoder rc = range_decoder; \
- size_t rc_in_pos = (in_pos); \
+ size_t rc_in_pos = in_pos - in_size; \
+ const uint8_t *restrict rc_end = in + in_size; \
uint32_t rc_bound
@@ -61,7 +62,7 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
#define rc_from_local(range_decoder, in_pos) \
do { \
range_decoder = rc; \
- in_pos = rc_in_pos; \
+ in_pos = rc_in_pos + in_size; \
} while (0)
@@ -87,12 +88,13 @@ do { \
#define rc_normalize(seq) \
do { \
if (rc.range < RC_TOP_VALUE) { \
- if (unlikely(rc_in_pos == in_size)) { \
+ if (unlikely(!rc_in_pos)) { \
coder->sequence = seq; \
goto out; \
} \
+ rc.code <<= RC_SHIFT_BITS; \
rc.range <<= RC_SHIFT_BITS; \
- rc.code = (rc.code << RC_SHIFT_BITS) | in[rc_in_pos++]; \
+ rc.code |= rc_end[rc_in_pos++]; \
} \
} while (0)
@@ -151,11 +153,79 @@ do { \
/// Decodes one bit, updates "symbol", and runs action0 or action1 depending
/// on the decoded bit.
+#ifndef NO_BRANCH_OPT
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \
+do { \
+ rc_normalize(seq); \
+ uint32_t cache = (prob), tmp; \
+ rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \
+ rc.range -= rc_bound; \
+ /* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \
+ __asm__ ( \
+ "cmpl %3, %2\n\t" \
+ "cmovbl %3, %1\n\t" \
+ "sbbl %0, %0" \
+ : "=&r"(tmp), "+&r"(rc.range) \
+ : "r"(rc.code), "r"(rc_bound) \
+ : "flags" \
+ ); \
+ cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \
+ prob -= cache >> RC_MOVE_BITS; \
+ pre0; tmp = ~tmp; pre1; \
+ rc.code -= tmp & rc_bound; \
+ if (!tmp) { action0; } else { action1; }; \
+} while (0)
+#elif defined(__GNUC__) && defined(__aarch64__)
+#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \
+do { \
+ rc_normalize(seq); \
+ uint32_t cache = (prob), tmp; \
+ rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \
+ rc.range -= rc_bound; \
+ /* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \
+ __asm__ ( \
+ "cmp %w2, %w3\n\t" \
+ "csel %w1, %w3, %w1, lo\n\t" \
+ "csetm %w0, lo" \
+ : "=&r"(tmp), "+&r"(rc.range) \
+ : "r"(rc.code), "r"(rc_bound) \
+ : "cc" \
+ ); \
+ cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \
+ prob -= cache >> RC_MOVE_BITS; \
+ pre0; tmp = ~tmp; pre1; \
+ rc.code -= tmp & rc_bound; \
+ if (!tmp) { action0; } else { action1; }; \
+} while (0)
+#endif
+#endif
+#ifdef rc_bit_nobranch
+#define rc_bit(prob, action0, action1, seq) \
+ rc_bit_nobranch(prob, , symbol = (symbol << 1) - tmp, \
+ action0, action1, seq)
+#define rc_bit_matched(prob, seq) \
+ rc_bit_nobranch(prob, offset &= match_bit ^ tmp, \
+ symbol = (symbol << 1) - tmp, , , seq)
+#define rc_bit_dist(prob, offset, seq) \
+ rc_bit_nobranch(prob, , \
+ symbol = (symbol << 1) - tmp; rep0 += offset & tmp, \
+ , , seq);
+#define rc_bit_dist_last(prob, offset, seq) \
+ rc_bit_nobranch(prob, , rep0 += offset & tmp, , , seq);
+#else
#define rc_bit(prob, action0, action1, seq) \
rc_bit_last(prob, \
symbol <<= 1; action0, \
symbol = (symbol << 1) + 1; action1, \
seq);
+#define rc_bit_matched(prob, seq) \
+ rc_bit(prob, offset &= ~match_bit, offset &= match_bit, seq)
+#define rc_bit_dist(prob, offset, seq) \
+ rc_bit(prob, , rep0 += offset, seq);
+#define rc_bit_dist_last(prob, offset, seq) \
+ rc_bit_last(prob, , rep0 += offset, seq)
+#endif
/// Like rc_bit() but add "case seq:" as a prefix. This makes the unrolled
--
2.17.1 |
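
For readers who don't want to trace the macros: the decoded bit is close to random, so the bit decision is a frequently mispredicted branch, and rc_bit_nobranch replaces it with a conditional move. Below is a standalone C sketch of the same computation, written by hand for illustration (decode_bit_branchless and its interface are made-up names, not code from the patch); the patch uses inline assembly because compilers do not reliably turn this pattern into CMOV/CSEL on their own.

#include <stdint.h>

#define RC_BIT_MODEL_TOTAL_BITS 11
#define RC_BIT_MODEL_TOTAL (1U << RC_BIT_MODEL_TOTAL_BITS)
#define RC_MOVE_BITS 5

/* One branchless range-decoder step; assumes rc_normalize() already ran.
 * Updates the probability, range and code, and returns the decoded bit. */
static inline uint32_t
decode_bit_branchless(uint16_t *prob, uint32_t *range, uint32_t *code)
{
	uint32_t cache = *prob;
	uint32_t bound = (*range >> RC_BIT_MODEL_TOTAL_BITS) * cache;
	*range -= bound;
	/* mask is ~0 when the bit is 0 (code < bound), else 0. The patch
	 * computes it with CMP+CMOV+SBB on x86, CMP+CSEL+CSETM on AArch64. */
	uint32_t mask = 0U - (uint32_t)(*code < bound);
	*range = (bound & mask) | (*range & ~mask); /* the conditional move */
	/* One update expression for both outcomes:
	 *   bit 0: prob += (RC_BIT_MODEL_TOTAL - prob) >> RC_MOVE_BITS
	 *   bit 1: prob -= prob >> RC_MOVE_BITS
	 * Adding the masked constant folds the bit-0 case into the bit-1
	 * formula; the excess high bits vanish in the 16-bit store. */
	cache += mask & ((1U << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL);
	*prob = (uint16_t)(*prob - (cache >> RC_MOVE_BITS));
	mask = ~mask; /* now ~0 when the bit is 1 */
	*code -= bound & mask;
	return mask & 1;
}

The second change in the commit message is independent of this: rc_to_local rebases rc_in_pos to in_pos - in_size and keeps a pointer rc_end = in + in_size, so the end-of-input test in rc_normalize becomes if (!rc_in_pos) instead of a comparison against in_size. The index wraps upward toward zero, which removes one live value from the hot loop; that is the part that helps register-starved targets such as 32-bit x86.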
Results from the ALT Linux team (decompression times in seconds):
Comet Lake:
linux-5.15.8.tar.xz : 5.81 --> 5.30 (+9.6%)
linux-firmware-20211027.tar.xz : 6.14 --> 5.31 (+15.6%)
Intel Xeon (Ivy Bridge EP):
linux-5.15.8.tar.xz : 7.987+-0.363 --> 7.128+-0.288 (+10.75+-5.80%)
linux-firmware-20211027.tar.xz : 8.552+-0.354 --> 7.059+-0.310 (+17.46+-5.51%)
I wrote scripts to make testing easier:
# downloads test files and builds XZ
./build.sh
# tests decompression speed
./bench.sh ref
./bench.sh new
build.sh
#!/bin/bash
set -e
# patch
[ -f faster_lzma_decoder_x86.patch ] || wget https://gist.githubusercontent.com/ilyakurdyukov/f514418f3affd677e1ac408ec0c09188/raw/faster_lzma_decoder_x86.patch
# test data
[ -f linux-5.15.7.tar.xz ] || wget https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.15.7.tar.xz
[ -f linux-firmware-20211027.tar.xz ] || wget https://mirrors.edge.kernel.org/pub/linux/kernel/firmware/linux-firmware-20211027.tar.xz
[ -d xz ] || git clone http://git.tukaani.org/xz.git
mkdir -p ref
mkdir -p new
# build ref
cp -r xz build
cd build
# or cmake as an option
./autogen.sh --no-po4a
./configure
make
cp -L src/liblzma/.libs/liblzma.so.5 src/xz/.libs/xz ../ref/
# build new
patch -p1 < ../faster_lzma_decoder_x86.patch
make
cp -L src/liblzma/.libs/liblzma.so.5 src/xz/.libs/xz ../new/
cd ..
bench.sh
#!/bin/bash
dir=${1:-"ref"}
export LD_LIBRARY_PATH=$(pwd)/$dir
# to make sure the correct liblzma is loaded
ldd $dir/xz | grep liblzma
# or "perf stat -r 5" instead of "for" and "time -p"
for i in 1 2 3; do
echo "# ($dir:$i) linux-5.15.7"
time -p $dir/xz -c -d linux-5.15.7.tar.xz > /dev/null
echo "# ($dir:$i) linux-firmware-20211027"
time -p $dir/xz -c -d linux-firmware-20211027.tar.xz > /dev/null
done
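
The perf alternative mentioned in the script's comment would look roughly like this (assuming perf from linux-tools is installed; -r 5 runs each command five times and reports mean and standard deviation, which matches the +- format of the ALT Linux results above):

# repeat each decode 5 times; perf prints its statistics to stderr
LD_LIBRARY_PATH=$PWD/ref perf stat -r 5 ref/xz -c -d linux-5.15.7.tar.xz > /dev/null
LD_LIBRARY_PATH=$PWD/new perf stat -r 5 new/xz -c -d linux-5.15.7.tar.xz > /dev/null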
Kunpeng-920 (AArch64):
linux-5.15.7.tar.xz : 8.82 --> 8.78 (+0.4%)
linux-firmware-20211027.tar.xz : 8.97 --> 8.44 (+6.2%)
I got a report that this patch slows down decoding on the Apple M1, but I am unable to investigate that myself. At least it would be easy to exclude the M1 with a preprocessor guard.
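Such a guard could look like this (untested sketch; whether __APPLE__ && __aarch64__ is the right condition for the M1 is my assumption). It relies on NO_BRANCH_OPT, the opt-out macro the patch already tests with #ifndef, and would go in range_decoder.h before the rc_bit_nobranch definitions:

/* Hypothetical opt-out: defining NO_BRANCH_OPT disables the branchless
 * path, falling back to the original rc_bit() implementation. */
#if defined(__APPLE__) && defined(__aarch64__)
#define NO_BRANCH_OPT /* reported slower on Apple M1 */
#endif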
AArch64, AWS Graviton2 (c6g.xlarge): ~7.9% speedup.