-
-
Save ilyakurdyukov/f514418f3affd677e1ac408ec0c09188 to your computer and use it in GitHub Desktop.
From 387fd25f57f41009fc317f7922e957de9f370ff2 Mon Sep 17 00:00:00 2001 | |
From: Ilya Kurdyukov <[email protected]> | |
Date: Tue, 14 Dec 2021 21:54:32 +0700 | |
Subject: [PATCH] faster lzma_decoder for x86 | |
Notice: Uses inline assembly with CMOV instruction. | |
Another change that removes the comparison with in_size can give a few | |
percent speedup for architectures with a small number of registers. | |
--- | |
src/liblzma/lzma/lzma_decoder.c | 78 +++++++++++++------------- | |
src/liblzma/rangecoder/range_decoder.h | 78 ++++++++++++++++++++++++-- | |
2 files changed, 113 insertions(+), 43 deletions(-) | |
diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c | |
index e605a0a..ecb804b 100644 | |
--- a/src/liblzma/lzma/lzma_decoder.c | |
+++ b/src/liblzma/lzma/lzma_decoder.c | |
@@ -415,9 +415,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr, | |
= offset + match_bit | |
+ symbol; | |
- rc_bit(probs[subcoder_index], | |
- offset &= ~match_bit, | |
- offset &= match_bit, | |
+ rc_bit_matched(probs[subcoder_index], | |
SEQ_LITERAL_MATCHED); | |
// It seems to be faster to do this | |
@@ -437,10 +435,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr, | |
case seq: \ | |
match_bit = len & offset; \ | |
subcoder_index = offset + match_bit + symbol; \ | |
- rc_bit(probs[subcoder_index], \ | |
- offset &= ~match_bit, \ | |
- offset &= match_bit, \ | |
- seq) | |
+ rc_bit_matched(probs[subcoder_index], seq) | |
d(SEQ_LITERAL_MATCHED0); | |
len <<= 1; | |
@@ -564,40 +559,43 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr, | |
probs = coder->pos_special + rep0 | |
- symbol - 1; | |
symbol = 1; | |
- offset = 0; | |
- case SEQ_DIST_MODEL: | |
+ offset = 1; | |
#ifdef HAVE_SMALL | |
+ limit = 1 << limit; | |
+ case SEQ_DIST_MODEL: | |
do { | |
- rc_bit(probs[symbol], , | |
- rep0 += 1U << offset, | |
+ rc_bit_dist(probs[symbol], | |
+ offset, | |
SEQ_DIST_MODEL); | |
- } while (++offset < limit); | |
+ offset <<= 1; | |
+ } while (offset < limit); | |
#else | |
+ case SEQ_DIST_MODEL: | |
switch (limit) { | |
case 5: | |
assert(offset == 0); | |
- rc_bit(probs[symbol], , | |
- rep0 += 1U, | |
+ rc_bit_dist(probs[symbol], | |
+ offset, | |
SEQ_DIST_MODEL); | |
- ++offset; | |
+ offset <<= 1; | |
--limit; | |
case 4: | |
- rc_bit(probs[symbol], , | |
- rep0 += 1U << offset, | |
+ rc_bit_dist(probs[symbol], | |
+ offset, | |
SEQ_DIST_MODEL); | |
- ++offset; | |
+ offset <<= 1; | |
--limit; | |
case 3: | |
- rc_bit(probs[symbol], , | |
- rep0 += 1U << offset, | |
+ rc_bit_dist(probs[symbol], | |
+ offset, | |
SEQ_DIST_MODEL); | |
- ++offset; | |
+ offset <<= 1; | |
--limit; | |
case 2: | |
- rc_bit(probs[symbol], , | |
- rep0 += 1U << offset, | |
+ rc_bit_dist(probs[symbol], | |
+ offset, | |
SEQ_DIST_MODEL); | |
- ++offset; | |
+ offset <<= 1; | |
--limit; | |
case 1: | |
// We need "symbol" only for | |
@@ -606,8 +604,8 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr, | |
// rc_bit_last() here to omit | |
// the unneeded updating of | |
// "symbol". | |
- rc_bit_last(probs[symbol], , | |
- rep0 += 1U << offset, | |
+ rc_bit_dist_last(probs[symbol], | |
+ offset, | |
SEQ_DIST_MODEL); | |
} | |
#endif | |
@@ -630,30 +628,32 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr, | |
rep0 <<= ALIGN_BITS; | |
symbol = 1; | |
#ifdef HAVE_SMALL | |
- offset = 0; | |
+ offset = 1; | |
case SEQ_ALIGN: | |
do { | |
- rc_bit(coder->pos_align[ | |
- symbol], , | |
- rep0 += 1U << offset, | |
+ rc_bit_dist(coder->pos_align[ | |
+ symbol], | |
+ offset, | |
SEQ_ALIGN); | |
- } while (++offset < ALIGN_BITS); | |
+ offset <<= 1; | |
+ } while (offset < (1 << ALIGN_BITS)); | |
#else | |
case SEQ_ALIGN0: | |
- rc_bit(coder->pos_align[symbol], , | |
- rep0 += 1, SEQ_ALIGN0); | |
+ rc_bit_dist(coder->pos_align[symbol], | |
+ 1, SEQ_ALIGN0); | |
case SEQ_ALIGN1: | |
- rc_bit(coder->pos_align[symbol], , | |
- rep0 += 2, SEQ_ALIGN1); | |
+ rc_bit_dist(coder->pos_align[symbol], | |
+ 2, SEQ_ALIGN1); | |
case SEQ_ALIGN2: | |
- rc_bit(coder->pos_align[symbol], , | |
- rep0 += 4, SEQ_ALIGN2); | |
+ rc_bit_dist(coder->pos_align[symbol], | |
+ 4, SEQ_ALIGN2); | |
case SEQ_ALIGN3: | |
// Like in SEQ_DIST_MODEL, we don't | |
// need "symbol" for anything else | |
// than indexing the probability array. | |
- rc_bit_last(coder->pos_align[symbol], , | |
- rep0 += 8, SEQ_ALIGN3); | |
+ rc_bit_dist_last(coder->pos_align[ | |
+ symbol], | |
+ 8, SEQ_ALIGN3); | |
#endif | |
if (rep0 == UINT32_MAX) { | |
diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h | |
index e0b051f..cc9ac35 100644 | |
--- a/src/liblzma/rangecoder/range_decoder.h | |
+++ b/src/liblzma/rangecoder/range_decoder.h | |
@@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in, | |
/// variables `in' and `in_size' to be defined. | |
#define rc_to_local(range_decoder, in_pos) \ | |
lzma_range_decoder rc = range_decoder; \ | |
- size_t rc_in_pos = (in_pos); \ | |
+ size_t rc_in_pos = in_pos - in_size; \ | |
+ const uint8_t *restrict rc_end = in + in_size; \ | |
uint32_t rc_bound | |
@@ -61,7 +62,7 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in, | |
#define rc_from_local(range_decoder, in_pos) \ | |
do { \ | |
range_decoder = rc; \ | |
- in_pos = rc_in_pos; \ | |
+ in_pos = rc_in_pos + in_size; \ | |
} while (0) | |
@@ -87,12 +88,13 @@ do { \ | |
#define rc_normalize(seq) \ | |
do { \ | |
if (rc.range < RC_TOP_VALUE) { \ | |
- if (unlikely(rc_in_pos == in_size)) { \ | |
+ if (unlikely(!rc_in_pos)) { \ | |
coder->sequence = seq; \ | |
goto out; \ | |
} \ | |
+ rc.code <<= RC_SHIFT_BITS; \ | |
rc.range <<= RC_SHIFT_BITS; \ | |
- rc.code = (rc.code << RC_SHIFT_BITS) | in[rc_in_pos++]; \ | |
+ rc.code |= rc_end[rc_in_pos++]; \ | |
} \ | |
} while (0) | |
@@ -151,11 +153,79 @@ do { \ | |
/// Decodes one bit, updates "symbol", and runs action0 or action1 depending | |
/// on the decoded bit. | |
+#ifndef NO_BRANCH_OPT | |
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) | |
+#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \ | |
+do { \ | |
+ rc_normalize(seq); \ | |
+ uint32_t cache = (prob), tmp; \ | |
+ rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \ | |
+ rc.range -= rc_bound; \ | |
+ /* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \ | |
+ __asm__ ( \ | |
+ "cmpl %3, %2\n\t" \ | |
+ "cmovbl %3, %1\n\t" \ | |
+ "sbbl %0, %0" \ | |
+ : "=&r"(tmp), "+&r"(rc.range) \ | |
+ : "r"(rc.code), "r"(rc_bound) \ | |
+ : "flags" \ | |
+ ); \ | |
+ cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \ | |
+ prob -= cache >> RC_MOVE_BITS; \ | |
+ pre0; tmp = ~tmp; pre1; \ | |
+ rc.code -= tmp & rc_bound; \ | |
+ if (!tmp) { action0; } else { action1; }; \ | |
+} while (0) | |
+#elif defined(__GNUC__) && defined(__aarch64__) | |
+#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \ | |
+do { \ | |
+ rc_normalize(seq); \ | |
+ uint32_t cache = (prob), tmp; \ | |
+ rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \ | |
+ rc.range -= rc_bound; \ | |
+ /* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \ | |
+ __asm__ ( \ | |
+ "cmp %w2, %w3\n\t" \ | |
+ "csel %w1, %w3, %w1, lo\n\t" \ | |
+ "csetm %w0, lo" \ | |
+ : "=&r"(tmp), "+&r"(rc.range) \ | |
+ : "r"(rc.code), "r"(rc_bound) \ | |
+ : "cc" \ | |
+ ); \ | |
+ cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \ | |
+ prob -= cache >> RC_MOVE_BITS; \ | |
+ pre0; tmp = ~tmp; pre1; \ | |
+ rc.code -= tmp & rc_bound; \ | |
+ if (!tmp) { action0; } else { action1; }; \ | |
+} while (0) | |
+#endif | |
+#endif | |
+#ifdef rc_bit_nobranch | |
+#define rc_bit(prob, action0, action1, seq) \ | |
+ rc_bit_nobranch(prob, , symbol = (symbol << 1) - tmp, \ | |
+ action0, action1, seq) | |
+#define rc_bit_matched(prob, seq) \ | |
+ rc_bit_nobranch(prob, offset &= match_bit ^ tmp, \ | |
+ symbol = (symbol << 1) - tmp, , , seq) | |
+#define rc_bit_dist(prob, offset, seq) \ | |
+ rc_bit_nobranch(prob, , \ | |
+ symbol = (symbol << 1) - tmp; rep0 += offset & tmp, \ | |
+ , , seq); | |
+#define rc_bit_dist_last(prob, offset, seq) \ | |
+ rc_bit_nobranch(prob, , rep0 += offset & tmp, , , seq); | |
+#else | |
#define rc_bit(prob, action0, action1, seq) \ | |
rc_bit_last(prob, \ | |
symbol <<= 1; action0, \ | |
symbol = (symbol << 1) + 1; action1, \ | |
seq); | |
+#define rc_bit_matched(prob, seq) \ | |
+ rc_bit(prob, offset &= ~match_bit, offset &= match_bit, seq) | |
+#define rc_bit_dist(prob, offset, seq) \ | |
+ rc_bit(prob, , rep0 += offset, seq); | |
+#define rc_bit_dist_last(prob, offset, seq) \ | |
+ rc_bit_last(prob, , rep0 += offset, seq) | |
+#endif | |
/// Like rc_bit() but add "case seq:" as a prefix. This makes the unrolled | |
-- | |
2.17.1 |
@amonakov, yes it is a good archive for testing binary data decompression performance.
export LD_LIBRARY_PATH=build/src/liblzma/.libs
perf stat -r 5 build/src/xz/.libs/xz -c -d linux-firmware-20211027.tar.xz > /dev/null
So, on Skylake before:
8 173,04 msec task-clock:u # 1,000 CPUs utilized ( +- 0,03% )
0 context-switches:u # 0,000 K/sec
0 cpu-migrations:u # 0,000 K/sec
16 467 page-faults:u # 0,002 M/sec ( +- 0,00% )
30 598 206 585 cycles:u # 3,744 GHz ( +- 0,02% )
34 721 982 623 instructions:u # 1,13 insn per cycle ( +- 0,00% )
4 194 995 085 branches:u # 513,272 M/sec ( +- 0,00% )
556 648 934 branch-misses:u # 13,27% of all branches ( +- 0,00% )
8,17348 +- 0,00210 seconds time elapsed ( +- 0,03% )
And after:
6 347,72 msec task-clock:u # 1,000 CPUs utilized ( +- 0,02% )
0 context-switches:u # 0,000 K/sec
0 cpu-migrations:u # 0,000 K/sec
16 468 page-faults:u # 0,003 M/sec ( +- 0,00% )
23 685 264 720 cycles:u # 3,731 GHz ( +- 0,02% )
39 467 984 353 instructions:u # 1,67 insn per cycle ( +- 0,00% )
2 941 144 012 branches:u # 463,338 M/sec ( +- 0,00% )
165 593 141 branch-misses:u # 5,63% of all branches ( +- 0,09% )
6,348644 +- 0,000952 seconds time elapsed ( +- 0,01% )
Speedup by almost 29%.
Intel Celeron 1007U (Ivy Bridge)
linux-5.15.7.tar.xz : 19.33 --> 18.07 (+7%)
linux-firmware-20211027.tar.xz : 20.57 --> 17.76 (+16%)
Ryzen 4500U (Zen 2)
Before:
Performance counter stats for 'src/xzdec/xzdec ../linux-firmware-20211027.tar.xz' (3 runs):
11,907.43 msec task-clock:u # 1.000 CPUs utilized ( +- 0.03% )
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
17,240 page-faults:u # 1.448 K/sec ( +- 0.02% )
27,992,949,586 cycles:u # 2.351 GHz ( +- 0.04% )
583,797,051 stalled-cycles-frontend:u # 2.09% frontend cycles idle ( +- 0.06% )
3,412,760,275 stalled-cycles-backend:u # 12.19% backend cycles idle ( +- 0.36% )
35,183,073,831 instructions:u # 1.26 insn per cycle
# 0.10 stalled cycles per insn ( +- 0.00% )
4,182,899,643 branches:u # 351.285 M/sec ( +- 0.00% )
531,756,997 branch-misses:u # 12.71% of all branches ( +- 0.01% )
11.90746 +- 0.00389 seconds time elapsed ( +- 0.03% )
After:
Performance counter stats for 'src/xzdec/xzdec ../linux-firmware-20211027.tar.xz' (3 runs):
9,797.98 msec task-clock:u # 1.000 CPUs utilized ( +- 0.11% )
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
17,247 page-faults:u # 1.760 K/sec ( +- 0.02% )
23,012,154,561 cycles:u # 2.349 GHz ( +- 0.02% )
173,975,497 stalled-cycles-frontend:u # 0.76% frontend cycles idle ( +- 0.09% )
12,818,655,196 stalled-cycles-backend:u # 55.70% backend cycles idle ( +- 0.02% )
39,198,156,956 instructions:u # 1.70 insn per cycle
# 0.33 stalled cycles per insn ( +- 0.00% )
2,970,714,992 branches:u # 303.197 M/sec ( +- 0.00% )
156,064,253 branch-misses:u # 5.25% of all branches ( +- 0.01% )
9.7976 +- 0.0107 seconds time elapsed ( +- 0.11% )
Did a minor update to cover the rc_bit() macro, which is used when XZ is configured with the --enable-small
option.
... and I did the same for AArch64, using CSEL instruction. (patch updated)
But I only have Allwinner H616 (Cortex-A53) for testing:
linux-5.15.7.tar.xz : 25.12 --> 23.85 (+5%)
linux-firmware-20211027.tar.xz : 22.80 --> 21.10 (+8%)
AArch64, AWS Graviton2 (c6g.xlarge
size):
Before:
Performance counter stats for './src/xzdec/xzdec ./linux-firmware-20211027.tar.xz' (5 runs):
8751.09 msec task-clock # 1.000 CPUs utilized ( +- 0.02% )
44 context-switches # 0.005 K/sec ( +- 5.47% )
0 cpu-migrations # 0.000 K/sec ( +- 61.24% )
18112 page-faults # 0.002 M/sec ( +- 0.05% )
21832211556 cycles # 2.495 GHz ( +- 0.02% )
30273778750 instructions # 1.39 insn per cycle ( +- 0.01% )
<not supported> branches
535339600 branch-misses ( +- 0.01% )
8.75277 +- 0.00198 seconds time elapsed ( +- 0.02% )
After:
Performance counter stats for './src/xzdec/xzdec ./linux-firmware-20211027.tar.xz' (5 runs):
8111.05 msec task-clock # 1.000 CPUs utilized ( +- 0.01% )
48 context-switches # 0.006 K/sec ( +- 2.54% )
1 cpu-migrations # 0.000 K/sec ( +- 31.62% )
18088 page-faults # 0.002 M/sec ( +- 0.01% )
20232553634 cycles # 2.494 GHz ( +- 0.01% )
34157801215 instructions # 1.69 insn per cycle ( +- 0.00% )
<not supported> branches
159725908 branch-misses ( +- 0.02% )
8.112706 +- 0.000784 seconds time elapsed ( +- 0.01% )
~7.9% speedup.
Results from ALT Linux team:
Comet Lake:
linux-5.15.8.tar.xz : 5.81 --> 5.30 (+9.6%)
linux-firmware-20211027.tar.xz : 6.14 --> 5.31 (+15.6%)
Intel Xeon (Ivy Bridge EP):
linux-5.15.8.tar.xz : 7.987+-0.363 --> 7.128+-0.288 ( 10.75+-5.80 % )
linux-firmware-20211027.tar.xz : 8.552+-0.354 --> 7.059+-0.310 ( 17.46+-5.51 % )
I wrote scripts to make testing easier:
# downloads test files and builds XZ
./build.sh
# tests decompression speed
./bench.sh ref
./bench.sh new
build.sh
#!/bin/bash
set -e
# patch
[ -f faster_lzma_decoder_x86.patch ] || wget https://gist.githubusercontent.com/ilyakurdyukov/f514418f3affd677e1ac408ec0c09188/raw/faster_lzma_decoder_x86.patch
# test data
[ -f linux-5.15.7.tar.xz ] || wget https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.15.7.tar.xz
[ -f linux-firmware-20211027.tar.xz ] || wget https://mirrors.edge.kernel.org/pub/linux/kernel/firmware/linux-firmware-20211027.tar.xz
[ -d xz ] || git clone http://git.tukaani.org/xz.git
mkdir -p ref
mkdir -p new
# build ref
cp -r xz build
cd build
# or cmake as an option
./autogen.sh --no-po4a
./configure
make
cp -L src/liblzma/.libs/liblzma.so.5 src/xz/.libs/xz ../ref/
# build new
patch -p1 < ../faster_lzma_decoder_x86.patch
make
cp -L src/liblzma/.libs/liblzma.so.5 src/xz/.libs/xz ../new/
cd ..
bench.sh
#!/bin/bash
dir=${1:-"ref"}
export LD_LIBRARY_PATH=$(pwd)/$dir
# to make sure the correct liblzma is loaded
ldd $dir/xz | grep liblzma
# or "perf stat -r 5" instead of "for" and "time -p"
for i in 1 2 3; do
echo "# ($dir:$i) linux-5.15.7"
time -p $dir/xz -c -d linux-5.15.7.tar.xz > /dev/null
echo "# ($dir:$i) linux-firmware-20211027"
time -p $dir/xz -c -d linux-firmware-20211027.tar.xz > /dev/null
done
Kunpeng-920 (AArch64)
linux-5.15.7.tar.xz : 8.82 --> 8.78 (+0.4%)
linux-firmware-20211027.tar.xz : 8.97 --> 8.44 (+6.2%)
I got a report that this patch is slowing decoding on the Apple M1, but I am unable to investigate this on my own.
Well, at least it will be easy to exclude it for Apple M1 by adding some #ifndef.
I got closer to 9% on Ivybridge (successor of Sandybridge uarch) on the linux-5.15.tar.xz archive; don't have
perf
on that machine at the moment, unfortunately.On Atom I guess the misprediction penalty is a bit smaller, and, further, the significant increase in instruction count is eating some of the benefit of reduced mispredictions?
I don't mind running additional tests on binary data, we just need to select some common archive to benchmark on (something that anyone could retrieve). How about the 175MiB linux-firmware archive at https://mirrors.edge.kernel.org/pub/linux/kernel/firmware/linux-firmware-20211027.tar.xz ?