ilyakurdyukov · June 18, 2023 17:04 · Dec 14, 2021 · Dec 14, 2021 · Dec 13, 2021 · Dec 13, 2021
diff --git a/faster_lzma_decoder_x86.patch b/faster_lzma_decoder_x86.patch
@@ -1,6 +1,6 @@
-From f0d2f989c249605e8c951b6770d2c05317fa8114 Mon Sep 17 00:00:00 2001
+From 387fd25f57f41009fc317f7922e957de9f370ff2 Mon Sep 17 00:00:00 2001
 From: Ilya Kurdyukov <jpegqs@gmail.com>
-Date: Tue, 14 Dec 2021 18:37:22 +0700
+Date: Tue, 14 Dec 2021 21:54:32 +0700
 Subject: [PATCH] faster lzma_decoder for x86
 
 Notice: Uses inline assembly with CMOV instruction.
@@ -9,8 +9,8 @@ Another change that removes the comparison with in_size can give a few
 percent speedup for architectures with a small number of registers.
 ---
  src/liblzma/lzma/lzma_decoder.c        | 78 +++++++++++++-------------
- src/liblzma/rangecoder/range_decoder.h | 51 +++++++++++++++--
- 2 files changed, 86 insertions(+), 43 deletions(-)
+ src/liblzma/rangecoder/range_decoder.h | 78 ++++++++++++++++++++++++--
+ 2 files changed, 113 insertions(+), 43 deletions(-)
 
 diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c
 index e605a0a..ecb804b 100644
@@ -158,7 +158,7 @@ index e605a0a..ecb804b 100644
 
  					if (rep0 == UINT32_MAX) {
 diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h
-index e0b051f..4bd78dd 100644
+index e0b051f..cc9ac35 100644
 --- a/src/liblzma/rangecoder/range_decoder.h
 +++ b/src/liblzma/rangecoder/range_decoder.h
 @@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
@@ -196,10 +196,11 @@ index e0b051f..4bd78dd 100644
  	} \
  } while (0)
 
-@@ -151,11 +153,52 @@ do { \
+@@ -151,11 +153,79 @@ do { \
 
  /// Decodes one bit, updates "symbol", and runs action0 or action1 depending
  /// on the decoded bit.
++#ifndef NO_BRANCH_OPT
 +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
 +#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \
 +do { \
@@ -214,13 +215,39 @@ index e0b051f..4bd78dd 100644
 +		"sbbl %0, %0" \
 +		: "=&r"(tmp), "+&r"(rc.range) \
 +		: "r"(rc.code), "r"(rc_bound) \
++		: "flags" \
 +	); \
 +	cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \
 +	prob -= cache >> RC_MOVE_BITS; \
 +	pre0; tmp = ~tmp; pre1; \
 +	rc.code -= tmp & rc_bound; \
 +	if (!tmp) { action0; } else { action1; }; \
 +} while (0)
++#elif defined(__GNUC__) && defined(__aarch64__)
++#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \
++do { \
++	rc_normalize(seq); \
++	uint32_t cache = (prob), tmp; \
++	rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \
++	rc.range -= rc_bound; \
++	/* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \
++	__asm__ ( \
++		"cmp %w2, %w3\n\t" \
++		"csel %w1, %w3, %w1, lo\n\t" \
++		"csetm %w0, lo" \
++		: "=&r"(tmp), "+&r"(rc.range) \
++		: "r"(rc.code), "r"(rc_bound) \
++		: "cc" \
++	); \
++	cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \
++	prob -= cache >> RC_MOVE_BITS; \
++	pre0; tmp = ~tmp; pre1; \
++	rc.code -= tmp & rc_bound; \
++	if (!tmp) { action0; } else { action1; }; \
++} while (0)
++#endif
++#endif
++#ifdef rc_bit_nobranch
 +#define rc_bit(prob, action0, action1, seq) \
 +	rc_bit_nobranch(prob, , symbol = (symbol << 1) - tmp, \
 +		action0, action1, seq)

diff --git a/faster_lzma_decoder_x86.patch b/faster_lzma_decoder_x86.patch
@@ -1,6 +1,6 @@
-From db71be5c2f77ae18afc73bba85b9c11c6624d243 Mon Sep 17 00:00:00 2001
+From f0d2f989c249605e8c951b6770d2c05317fa8114 Mon Sep 17 00:00:00 2001
 From: Ilya Kurdyukov <jpegqs@gmail.com>
-Date: Mon, 13 Dec 2021 21:25:54 +0700
+Date: Tue, 14 Dec 2021 18:37:22 +0700
 Subject: [PATCH] faster lzma_decoder for x86
 
 Notice: Uses inline assembly with CMOV instruction.
@@ -158,7 +158,7 @@ index e605a0a..ecb804b 100644
 
  					if (rep0 == UINT32_MAX) {
 diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h
-index e0b051f..baec45c 100644
+index e0b051f..4bd78dd 100644
 --- a/src/liblzma/rangecoder/range_decoder.h
 +++ b/src/liblzma/rangecoder/range_decoder.h
 @@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
@@ -196,10 +196,10 @@ index e0b051f..baec45c 100644
  	} \
  } while (0)
 
-@@ -163,8 +165,49 @@ do { \
- /// statements. On the other hand this also makes it less readable, since
- /// spotting the places where the decoder loop may be restarted is less
- /// obvious.
+@@ -151,11 +153,52 @@ do { \
+
+ /// Decodes one bit, updates "symbol", and runs action0 or action1 depending
+ /// on the decoded bit.
 +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
 +#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \
 +do { \
@@ -221,8 +221,8 @@ index e0b051f..baec45c 100644
 +	rc.code -= tmp & rc_bound; \
 +	if (!tmp) { action0; } else { action1; }; \
 +} while (0)
-+#define rc_bit_case(prob, action0, action1, seq) \
-+	case seq: rc_bit_nobranch(prob, , symbol = (symbol << 1) - tmp, \
++#define rc_bit(prob, action0, action1, seq) \
++	rc_bit_nobranch(prob, , symbol = (symbol << 1) - tmp, \
 +		action0, action1, seq)
 +#define rc_bit_matched(prob, seq) \
 +	rc_bit_nobranch(prob, offset &= match_bit ^ tmp, \
@@ -234,8 +234,11 @@ index e0b051f..baec45c 100644
 +#define rc_bit_dist_last(prob, offset, seq) \
 +	rc_bit_nobranch(prob, , rep0 += offset & tmp, , , seq);
 +#else
- #define rc_bit_case(prob, action0, action1, seq) \
- 	case seq: rc_bit(prob, action0, action1, seq)
+ #define rc_bit(prob, action0, action1, seq) \
+ 	rc_bit_last(prob, \
+ 		symbol <<= 1; action0, \
+ 		symbol = (symbol << 1) + 1; action1, \
+ 		seq);
 +#define rc_bit_matched(prob, seq) \
 +	rc_bit(prob, offset &= ~match_bit, offset &= match_bit, seq)
 +#define rc_bit_dist(prob, offset, seq) \
@@ -245,6 +248,6 @@ index e0b051f..baec45c 100644
 +#endif
 
 
- /// Decode a bit without using a probability.
+ /// Like rc_bit() but add "case seq:" as a prefix. This makes the unrolled
 -- 
 2.17.1
diff --git a/faster_lzma_decoder_x86.patch b/faster_lzma_decoder_x86.patch
@@ -1,19 +1,19 @@
-From 32a00b809c5989a196a8045cca7c82d4dcec89a8 Mon Sep 17 00:00:00 2001
+From db71be5c2f77ae18afc73bba85b9c11c6624d243 Mon Sep 17 00:00:00 2001
 From: Ilya Kurdyukov <jpegqs@gmail.com>
-Date: Mon, 13 Dec 2021 19:34:03 +0700
+Date: Mon, 13 Dec 2021 21:25:54 +0700
 Subject: [PATCH] faster lzma_decoder for x86
 
 Notice: Uses inline assembly with CMOV instruction.
 
 Another change that removes the comparison with in_size can give a few
 percent speedup for architectures with a small number of registers.
 ---
- src/liblzma/lzma/lzma_decoder.c        |  9 ++----
- src/liblzma/rangecoder/range_decoder.h | 41 +++++++++++++++++++++++---
- 2 files changed, 39 insertions(+), 11 deletions(-)
+ src/liblzma/lzma/lzma_decoder.c        | 78 +++++++++++++-------------
+ src/liblzma/rangecoder/range_decoder.h | 51 +++++++++++++++--
+ 2 files changed, 86 insertions(+), 43 deletions(-)
 
 diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c
-index e605a0a..a11e2f7 100644
+index e605a0a..ecb804b 100644
 --- a/src/liblzma/lzma/lzma_decoder.c
 +++ b/src/liblzma/lzma/lzma_decoder.c
 @@ -415,9 +415,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
@@ -39,8 +39,126 @@ index e605a0a..a11e2f7 100644
 
  				d(SEQ_LITERAL_MATCHED0);
  				len <<= 1;
+@@ -564,40 +559,43 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
+ 					probs = coder->pos_special + rep0
+ 							- symbol - 1;
+ 					symbol = 1;
+-					offset = 0;
+-	case SEQ_DIST_MODEL:
++					offset = 1;
+ #ifdef HAVE_SMALL
++					limit = 1 << limit;
++	case SEQ_DIST_MODEL:
+ 					do {
+-						rc_bit(probs[symbol], ,
+-							rep0 += 1U << offset,
++						rc_bit_dist(probs[symbol],
++							offset,
+ 							SEQ_DIST_MODEL);
+-					} while (++offset < limit);
++						offset <<= 1;
++					} while (offset < limit);
+ #else
++	case SEQ_DIST_MODEL:
+ 					switch (limit) {
+ 					case 5:
+ 						assert(offset == 0);
+-						rc_bit(probs[symbol], ,
+-							rep0 += 1U,
++						rc_bit_dist(probs[symbol],
++							offset,
+ 							SEQ_DIST_MODEL);
+-						++offset;
++						offset <<= 1;
+ 						--limit;
+ 					case 4:
+-						rc_bit(probs[symbol], ,
+-							rep0 += 1U << offset,
++						rc_bit_dist(probs[symbol],
++							offset,
+ 							SEQ_DIST_MODEL);
+-						++offset;
++						offset <<= 1;
+ 						--limit;
+ 					case 3:
+-						rc_bit(probs[symbol], ,
+-							rep0 += 1U << offset,
++						rc_bit_dist(probs[symbol],
++							offset,
+ 							SEQ_DIST_MODEL);
+-						++offset;
++						offset <<= 1;
+ 						--limit;
+ 					case 2:
+-						rc_bit(probs[symbol], ,
+-							rep0 += 1U << offset,
++						rc_bit_dist(probs[symbol],
++							offset,
+ 							SEQ_DIST_MODEL);
+-						++offset;
++						offset <<= 1;
+ 						--limit;
+ 					case 1:
+ 						// We need "symbol" only for
+@@ -606,8 +604,8 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
+ 						// rc_bit_last() here to omit
+ 						// the unneeded updating of
+ 						// "symbol".
+-						rc_bit_last(probs[symbol], ,
+-							rep0 += 1U << offset,
++						rc_bit_dist_last(probs[symbol],
++							offset,
+ 							SEQ_DIST_MODEL);
+ 					}
+ #endif
+@@ -630,30 +628,32 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
+ 					rep0 <<= ALIGN_BITS;
+ 					symbol = 1;
+ #ifdef HAVE_SMALL
+-					offset = 0;
++					offset = 1;
+ 	case SEQ_ALIGN:
+ 					do {
+-						rc_bit(coder->pos_align[
+-								symbol], ,
+-							rep0 += 1U << offset,
++						rc_bit_dist(coder->pos_align[
++								symbol],
++							offset,
+ 							SEQ_ALIGN);
+-					} while (++offset < ALIGN_BITS);
++						offset <<= 1;
++					} while (offset < (1 << ALIGN_BITS));
+ #else
+ 	case SEQ_ALIGN0:
+-					rc_bit(coder->pos_align[symbol], ,
+-							rep0 += 1, SEQ_ALIGN0);
++					rc_bit_dist(coder->pos_align[symbol],
++							1, SEQ_ALIGN0);
+ 	case SEQ_ALIGN1:
+-					rc_bit(coder->pos_align[symbol], ,
+-							rep0 += 2, SEQ_ALIGN1);
++					rc_bit_dist(coder->pos_align[symbol],
++							2, SEQ_ALIGN1);
+ 	case SEQ_ALIGN2:
+-					rc_bit(coder->pos_align[symbol], ,
+-							rep0 += 4, SEQ_ALIGN2);
++					rc_bit_dist(coder->pos_align[symbol],
++							4, SEQ_ALIGN2);
+ 	case SEQ_ALIGN3:
+ 					// Like in SEQ_DIST_MODEL, we don't
+ 					// need "symbol" for anything else
+ 					// than indexing the probability array.
+-					rc_bit_last(coder->pos_align[symbol], ,
+-							rep0 += 8, SEQ_ALIGN3);
++					rc_bit_dist_last(coder->pos_align[
++								symbol],
++							8, SEQ_ALIGN3);
+ #endif
+
+ 					if (rep0 == UINT32_MAX) {
 diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h
-index e0b051f..e4a2b25 100644
+index e0b051f..baec45c 100644
 --- a/src/liblzma/rangecoder/range_decoder.h
 +++ b/src/liblzma/rangecoder/range_decoder.h
 @@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
@@ -78,12 +196,12 @@ index e0b051f..e4a2b25 100644
  	} \
  } while (0)
 
-@@ -163,8 +165,39 @@ do { \
+@@ -163,8 +165,49 @@ do { \
  /// statements. On the other hand this also makes it less readable, since
  /// spotting the places where the decoder loop may be restarted is less
  /// obvious.
 +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
-+#define rc_bit_nobranch(prob, pre_action, action0, action1, seq) \
++#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \
 +do { \
 +	rc_normalize(seq); \
 +	uint32_t cache = (prob), tmp; \
@@ -99,21 +217,31 @@ index e0b051f..e4a2b25 100644
 +	); \
 +	cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \
 +	prob -= cache >> RC_MOVE_BITS; \
-+	pre_action; \
-+	tmp = ~tmp; \
-+	symbol = (symbol << 1) - tmp; \
++	pre0; tmp = ~tmp; pre1; \
 +	rc.code -= tmp & rc_bound; \
 +	if (!tmp) { action0; } else { action1; }; \
 +} while (0)
 +#define rc_bit_case(prob, action0, action1, seq) \
-+	case seq: rc_bit_nobranch(prob, , action0, action1, seq)
++	case seq: rc_bit_nobranch(prob, , symbol = (symbol << 1) - tmp, \
++		action0, action1, seq)
 +#define rc_bit_matched(prob, seq) \
-+	rc_bit_nobranch(prob, offset &= match_bit ^ tmp, , , seq)
++	rc_bit_nobranch(prob, offset &= match_bit ^ tmp, \
++		symbol = (symbol << 1) - tmp, , , seq)
++#define rc_bit_dist(prob, offset, seq) \
++	rc_bit_nobranch(prob, , \
++		symbol = (symbol << 1) - tmp; rep0 += offset & tmp, \
++		, , seq);
++#define rc_bit_dist_last(prob, offset, seq) \
++	rc_bit_nobranch(prob, , rep0 += offset & tmp, , , seq);
 +#else
  #define rc_bit_case(prob, action0, action1, seq) \
  	case seq: rc_bit(prob, action0, action1, seq)
 +#define rc_bit_matched(prob, seq) \
 +	rc_bit(prob, offset &= ~match_bit, offset &= match_bit, seq)
++#define rc_bit_dist(prob, offset, seq) \
++	rc_bit(prob, , rep0 += offset, seq);
++#define rc_bit_dist_last(prob, offset, seq) \
++	rc_bit_last(prob, , rep0 += offset, seq)
 +#endif
 
 

diff --git a/faster_lzma_decoder_x86.patch b/faster_lzma_decoder_x86.patch
@@ -1,26 +1,54 @@
-From a5a557f4ef82c6574b6fbe2b44030fbfe69be4d5 Mon Sep 17 00:00:00 2001
+From 32a00b809c5989a196a8045cca7c82d4dcec89a8 Mon Sep 17 00:00:00 2001
 From: Ilya Kurdyukov <jpegqs@gmail.com>
-Date: Mon, 13 Dec 2021 16:19:34 +0700
+Date: Mon, 13 Dec 2021 19:34:03 +0700
 Subject: [PATCH] faster lzma_decoder for x86
 
 Notice: Uses inline assembly with CMOV instruction.
 
 Another change that removes the comparison with in_size can give a few
 percent speedup for architectures with a small number of registers.
 ---
- src/liblzma/rangecoder/range_decoder.h | 35 +++++++++++++++++++++++---
- 1 file changed, 31 insertions(+), 4 deletions(-)
+ src/liblzma/lzma/lzma_decoder.c        |  9 ++----
+ src/liblzma/rangecoder/range_decoder.h | 41 +++++++++++++++++++++++---
+ 2 files changed, 39 insertions(+), 11 deletions(-)
 
+diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c
+index e605a0a..a11e2f7 100644
+--- a/src/liblzma/lzma/lzma_decoder.c
++++ b/src/liblzma/lzma/lzma_decoder.c
+@@ -415,9 +415,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
+ 							= offset + match_bit
+ 							+ symbol;
+
+-					rc_bit(probs[subcoder_index],
+-							offset &= ~match_bit,
+-							offset &= match_bit,
++					rc_bit_matched(probs[subcoder_index],
+ 							SEQ_LITERAL_MATCHED);
+
+ 					// It seems to be faster to do this
+@@ -437,10 +435,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
+ 		case seq: \
+ 			match_bit = len & offset; \
+ 			subcoder_index = offset + match_bit + symbol; \
+-			rc_bit(probs[subcoder_index], \
+-					offset &= ~match_bit, \
+-					offset &= match_bit, \
+-					seq)
++			rc_bit_matched(probs[subcoder_index], seq)
+
+ 				d(SEQ_LITERAL_MATCHED0);
+ 				len <<= 1;
 diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h
-index e0b051f..0c3d140 100644
+index e0b051f..e4a2b25 100644
 --- a/src/liblzma/rangecoder/range_decoder.h
 +++ b/src/liblzma/rangecoder/range_decoder.h
 @@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
  /// variables `in' and `in_size' to be defined.
  #define rc_to_local(range_decoder, in_pos) \
  	lzma_range_decoder rc = range_decoder; \
 -	size_t rc_in_pos = (in_pos); \
-+	size_t rc_in_pos = (in_pos) - in_size; \
++	size_t rc_in_pos = in_pos - in_size; \
 +	const uint8_t *restrict rc_end = in + in_size; \
  	uint32_t rc_bound
 
@@ -50,20 +78,19 @@ index e0b051f..0c3d140 100644
  	} \
  } while (0)
 
-@@ -163,8 +165,33 @@ do { \
+@@ -163,8 +165,39 @@ do { \
  /// statements. On the other hand this also makes it less readable, since
  /// spotting the places where the decoder loop may be restarted is less
  /// obvious.
 +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
-+#define rc_bit_case(prob, action0, action1, seq) \
-+case seq: \
++#define rc_bit_nobranch(prob, pre_action, action0, action1, seq) \
 +do { \
 +	rc_normalize(seq); \
 +	uint32_t cache = (prob), tmp; \
 +	rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \
 +	rc.range -= rc_bound; \
 +	/* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \
-+	asm( \
++	__asm__ ( \
 +		"cmpl %3, %2\n\t" \
 +		"cmovbl %3, %1\n\t" \
 +		"sbbl %0, %0" \
@@ -72,14 +99,21 @@ index e0b051f..0c3d140 100644
 +	); \
 +	cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \
 +	prob -= cache >> RC_MOVE_BITS; \
++	pre_action; \
 +	tmp = ~tmp; \
 +	symbol = (symbol << 1) - tmp; \
 +	rc.code -= tmp & rc_bound; \
 +	if (!tmp) { action0; } else { action1; }; \
 +} while (0)
++#define rc_bit_case(prob, action0, action1, seq) \
++	case seq: rc_bit_nobranch(prob, , action0, action1, seq)
++#define rc_bit_matched(prob, seq) \
++	rc_bit_nobranch(prob, offset &= match_bit ^ tmp, , , seq)
 +#else
  #define rc_bit_case(prob, action0, action1, seq) \
  	case seq: rc_bit(prob, action0, action1, seq)
++#define rc_bit_matched(prob, seq) \
++	rc_bit(prob, offset &= ~match_bit, offset &= match_bit, seq)
 +#endif
 
 

diff --git a/faster_lzma_decoder_x86.patch b/faster_lzma_decoder_x86.patch
@@ -1,6 +1,6 @@
 From a5a557f4ef82c6574b6fbe2b44030fbfe69be4d5 Mon Sep 17 00:00:00 2001
 From: Ilya Kurdyukov <jpegqs@gmail.com>
-Date: Mon, 13 Dec 2021 15:58:46 +0700
+Date: Mon, 13 Dec 2021 16:19:34 +0700
 Subject: [PATCH] faster lzma_decoder for x86
 
 Notice: Uses inline assembly with CMOV instruction.
@@ -12,7 +12,7 @@ percent speedup for architectures with a small number of registers.
  1 file changed, 31 insertions(+), 4 deletions(-)
 
 diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h
-index e0b051f..a50523a 100644
+index e0b051f..0c3d140 100644
 --- a/src/liblzma/rangecoder/range_decoder.h
 +++ b/src/liblzma/rangecoder/range_decoder.h
 @@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
@@ -54,7 +54,7 @@ index e0b051f..a50523a 100644
  /// statements. On the other hand this also makes it less readable, since
  /// spotting the places where the decoder loop may be restarted is less
  /// obvious.
-+#if defined(__GNUC__) && (defined(__x86__) || defined(__x86_64__))
++#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
 +#define rc_bit_case(prob, action0, action1, seq) \
 +case seq: \
 +do { \

diff --git a/faster_lzma_decoder_x86.patch b/faster_lzma_decoder_x86.patch
@@ -0,0 +1,88 @@
+From a5a557f4ef82c6574b6fbe2b44030fbfe69be4d5 Mon Sep 17 00:00:00 2001
+From: Ilya Kurdyukov <jpegqs@gmail.com>
+Date: Mon, 13 Dec 2021 15:58:46 +0700
+Subject: [PATCH] faster lzma_decoder for x86
+
+Notice: Uses inline assembly with CMOV instruction.
+
+Another change that removes the comparison with in_size can give a few
+percent speedup for architectures with a small number of registers.
+---
+ src/liblzma/rangecoder/range_decoder.h | 35 +++++++++++++++++++++++---
+ 1 file changed, 31 insertions(+), 4 deletions(-)
+
+diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h
+index e0b051f..a50523a 100644
+--- a/src/liblzma/rangecoder/range_decoder.h
++++ b/src/liblzma/rangecoder/range_decoder.h
+@@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
+ /// variables `in' and `in_size' to be defined.
+ #define rc_to_local(range_decoder, in_pos) \
+ 	lzma_range_decoder rc = range_decoder; \
+-	size_t rc_in_pos = (in_pos); \
++	size_t rc_in_pos = (in_pos) - in_size; \
++	const uint8_t *restrict rc_end = in + in_size; \
+ 	uint32_t rc_bound
+
+
+@@ -61,7 +62,7 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
+ #define rc_from_local(range_decoder, in_pos) \
+ do { \
+ 	range_decoder = rc; \
+-	in_pos = rc_in_pos; \
++	in_pos = rc_in_pos + in_size; \
+ } while (0)
+
+
+@@ -87,12 +88,13 @@ do { \
+ #define rc_normalize(seq) \
+ do { \
+ 	if (rc.range < RC_TOP_VALUE) { \
+-		if (unlikely(rc_in_pos == in_size)) { \
++		if (unlikely(!rc_in_pos)) { \
+ 			coder->sequence = seq; \
+ 			goto out; \
+ 		} \
++		rc.code <<= RC_SHIFT_BITS; \
+ 		rc.range <<= RC_SHIFT_BITS; \
+-		rc.code = (rc.code << RC_SHIFT_BITS) | in[rc_in_pos++]; \
++		rc.code |= rc_end[rc_in_pos++]; \
+ 	} \
+ } while (0)
+
+@@ -163,8 +165,33 @@ do { \
+ /// statements. On the other hand this also makes it less readable, since
+ /// spotting the places where the decoder loop may be restarted is less
+ /// obvious.
++#if defined(__GNUC__) && (defined(__x86__) || defined(__x86_64__))
++#define rc_bit_case(prob, action0, action1, seq) \
++case seq: \
++do { \
++	rc_normalize(seq); \
++	uint32_t cache = (prob), tmp; \
++	rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \
++	rc.range -= rc_bound; \
++	/* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \
++	asm( \
++		"cmpl %3, %2\n\t" \
++		"cmovbl %3, %1\n\t" \
++		"sbbl %0, %0" \
++		: "=&r"(tmp), "+&r"(rc.range) \
++		: "r"(rc.code), "r"(rc_bound) \
++	); \
++	cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \
++	prob -= cache >> RC_MOVE_BITS; \
++	tmp = ~tmp; \
++	symbol = (symbol << 1) - tmp; \
++	rc.code -= tmp & rc_bound; \
++	if (!tmp) { action0; } else { action1; }; \
++} while (0)
++#else
+ #define rc_bit_case(prob, action0, action1, seq) \
+ 	case seq: rc_bit(prob, action0, action1, seq)
++#endif
+
+
+ /// Decode a bit without using a probability.
+-- 
+2.17.1
No results found