Last active
June 18, 2023 17:04
-
-
Save ilyakurdyukov/f514418f3affd677e1ac408ec0c09188 to your computer and use it in GitHub Desktop.
Revisions
-
ilyakurdyukov revised this gist
Dec 14, 2021 . 1 changed file with 33 additions and 6 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,6 +1,6 @@ From 387fd25f57f41009fc317f7922e957de9f370ff2 Mon Sep 17 00:00:00 2001 From: Ilya Kurdyukov <jpegqs@gmail.com> Date: Tue, 14 Dec 2021 21:54:32 +0700 Subject: [PATCH] faster lzma_decoder for x86 Notice: Uses inline assembly with CMOV instruction. @@ -9,8 +9,8 @@ Another change that removes the comparison with in_size can give a few percent speedup for architectures with a small number of registers. --- src/liblzma/lzma/lzma_decoder.c | 78 +++++++++++++------------- src/liblzma/rangecoder/range_decoder.h | 78 ++++++++++++++++++++++++-- 2 files changed, 113 insertions(+), 43 deletions(-) diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c index e605a0a..ecb804b 100644 @@ -158,7 +158,7 @@ index e605a0a..ecb804b 100644 if (rep0 == UINT32_MAX) { diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h index e0b051f..cc9ac35 100644 --- a/src/liblzma/rangecoder/range_decoder.h +++ b/src/liblzma/rangecoder/range_decoder.h @@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in, @@ -196,10 +196,11 @@ index e0b051f..4bd78dd 100644 } \ } while (0) @@ -151,11 +153,79 @@ do { \ /// Decodes one bit, updates "symbol", and runs action0 or action1 depending /// on the decoded bit. +#ifndef NO_BRANCH_OPT +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \ +do { \ @@ -214,13 +215,39 @@ index e0b051f..4bd78dd 100644 + "sbbl %0, %0" \ + : "=&r"(tmp), "+&r"(rc.range) \ + : "r"(rc.code), "r"(rc_bound) \ + : "flags" \ + ); \ + cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \ + prob -= cache >> RC_MOVE_BITS; \ + pre0; tmp = ~tmp; pre1; \ + rc.code -= tmp & rc_bound; \ + if (!tmp) { action0; } else { action1; }; \ +} while (0) +#elif defined(__GNUC__) && defined(__aarch64__) +#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \ +do { \ + rc_normalize(seq); \ + uint32_t cache = (prob), tmp; \ + rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \ + rc.range -= rc_bound; \ + /* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \ + __asm__ ( \ + "cmp %w2, %w3\n\t" \ + "csel %w1, %w3, %w1, lo\n\t" \ + "csetm %w0, lo" \ + : "=&r"(tmp), "+&r"(rc.range) \ + : "r"(rc.code), "r"(rc_bound) \ + : "cc" \ + ); \ + cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \ + prob -= cache >> RC_MOVE_BITS; \ + pre0; tmp = ~tmp; pre1; \ + rc.code -= tmp & rc_bound; \ + if (!tmp) { action0; } else { action1; }; \ +} while (0) +#endif +#endif +#ifdef rc_bit_nobranch +#define rc_bit(prob, action0, action1, seq) \ + rc_bit_nobranch(prob, , symbol = (symbol << 1) - tmp, \ + action0, action1, seq) -
ilyakurdyukov revised this gist
Dec 14, 2021 . 1 changed file with 15 additions and 12 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,6 +1,6 @@ From f0d2f989c249605e8c951b6770d2c05317fa8114 Mon Sep 17 00:00:00 2001 From: Ilya Kurdyukov <jpegqs@gmail.com> Date: Tue, 14 Dec 2021 18:37:22 +0700 Subject: [PATCH] faster lzma_decoder for x86 Notice: Uses inline assembly with CMOV instruction. @@ -158,7 +158,7 @@ index e605a0a..ecb804b 100644 if (rep0 == UINT32_MAX) { diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h index e0b051f..4bd78dd 100644 --- a/src/liblzma/rangecoder/range_decoder.h +++ b/src/liblzma/rangecoder/range_decoder.h @@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in, @@ -196,10 +196,10 @@ index e0b051f..baec45c 100644 } \ } while (0) @@ -151,11 +153,52 @@ do { \ /// Decodes one bit, updates "symbol", and runs action0 or action1 depending /// on the decoded bit. +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \ +do { \ @@ -221,8 +221,8 @@ index e0b051f..baec45c 100644 + rc.code -= tmp & rc_bound; \ + if (!tmp) { action0; } else { action1; }; \ +} while (0) +#define rc_bit(prob, action0, action1, seq) \ + rc_bit_nobranch(prob, , symbol = (symbol << 1) - tmp, \ + action0, action1, seq) +#define rc_bit_matched(prob, seq) \ + rc_bit_nobranch(prob, offset &= match_bit ^ tmp, \ @@ -234,8 +234,11 @@ index e0b051f..baec45c 100644 +#define rc_bit_dist_last(prob, offset, seq) \ + rc_bit_nobranch(prob, , rep0 += offset & tmp, , , seq); +#else #define rc_bit(prob, action0, action1, seq) \ rc_bit_last(prob, \ symbol <<= 1; action0, \ symbol = (symbol << 1) + 1; action1, \ seq); +#define rc_bit_matched(prob, seq) \ + rc_bit(prob, offset &= ~match_bit, offset &= match_bit, seq) +#define rc_bit_dist(prob, offset, seq) \ @@ -245,6 +248,6 @@ index e0b051f..baec45c 100644 +#endif /// Like rc_bit() but add "case seq:" as a prefix. This makes the unrolled -- 2.17.1 -
ilyakurdyukov revised this gist
Dec 13, 2021 . No changes.There are no files selected for viewing
-
ilyakurdyukov revised this gist
Dec 13, 2021 . 1 changed file with 142 additions and 14 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,19 +1,19 @@ From db71be5c2f77ae18afc73bba85b9c11c6624d243 Mon Sep 17 00:00:00 2001 From: Ilya Kurdyukov <jpegqs@gmail.com> Date: Mon, 13 Dec 2021 21:25:54 +0700 Subject: [PATCH] faster lzma_decoder for x86 Notice: Uses inline assembly with CMOV instruction. Another change that removes the comparison with in_size can give a few percent speedup for architectures with a small number of registers. --- src/liblzma/lzma/lzma_decoder.c | 78 +++++++++++++------------- src/liblzma/rangecoder/range_decoder.h | 51 +++++++++++++++-- 2 files changed, 86 insertions(+), 43 deletions(-) diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c index e605a0a..ecb804b 100644 --- a/src/liblzma/lzma/lzma_decoder.c +++ b/src/liblzma/lzma/lzma_decoder.c @@ -415,9 +415,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr, @@ -39,8 +39,126 @@ index e605a0a..a11e2f7 100644 d(SEQ_LITERAL_MATCHED0); len <<= 1; @@ -564,40 +559,43 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr, probs = coder->pos_special + rep0 - symbol - 1; symbol = 1; - offset = 0; - case SEQ_DIST_MODEL: + offset = 1; #ifdef HAVE_SMALL + limit = 1 << limit; + case SEQ_DIST_MODEL: do { - rc_bit(probs[symbol], , - rep0 += 1U << offset, + rc_bit_dist(probs[symbol], + offset, SEQ_DIST_MODEL); - } while (++offset < limit); + offset <<= 1; + } while (offset < limit); #else + case SEQ_DIST_MODEL: switch (limit) { case 5: assert(offset == 0); - rc_bit(probs[symbol], , - rep0 += 1U, + rc_bit_dist(probs[symbol], + offset, SEQ_DIST_MODEL); - ++offset; + offset <<= 1; --limit; case 4: - rc_bit(probs[symbol], , - rep0 += 1U << offset, + rc_bit_dist(probs[symbol], + offset, SEQ_DIST_MODEL); - ++offset; + offset <<= 1; --limit; case 3: - rc_bit(probs[symbol], , - rep0 += 1U << offset, + rc_bit_dist(probs[symbol], + offset, SEQ_DIST_MODEL); - ++offset; + offset <<= 1; --limit; case 2: - rc_bit(probs[symbol], , - rep0 += 1U << offset, + rc_bit_dist(probs[symbol], + offset, SEQ_DIST_MODEL); - ++offset; + offset <<= 1; --limit; case 1: // We need "symbol" only for @@ -606,8 +604,8 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr, // rc_bit_last() here to omit // the unneeded updating of // "symbol". - rc_bit_last(probs[symbol], , - rep0 += 1U << offset, + rc_bit_dist_last(probs[symbol], + offset, SEQ_DIST_MODEL); } #endif @@ -630,30 +628,32 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr, rep0 <<= ALIGN_BITS; symbol = 1; #ifdef HAVE_SMALL - offset = 0; + offset = 1; case SEQ_ALIGN: do { - rc_bit(coder->pos_align[ - symbol], , - rep0 += 1U << offset, + rc_bit_dist(coder->pos_align[ + symbol], + offset, SEQ_ALIGN); - } while (++offset < ALIGN_BITS); + offset <<= 1; + } while (offset < (1 << ALIGN_BITS)); #else case SEQ_ALIGN0: - rc_bit(coder->pos_align[symbol], , - rep0 += 1, SEQ_ALIGN0); + rc_bit_dist(coder->pos_align[symbol], + 1, SEQ_ALIGN0); case SEQ_ALIGN1: - rc_bit(coder->pos_align[symbol], , - rep0 += 2, SEQ_ALIGN1); + rc_bit_dist(coder->pos_align[symbol], + 2, SEQ_ALIGN1); case SEQ_ALIGN2: - rc_bit(coder->pos_align[symbol], , - rep0 += 4, SEQ_ALIGN2); + rc_bit_dist(coder->pos_align[symbol], + 4, SEQ_ALIGN2); case SEQ_ALIGN3: // Like in SEQ_DIST_MODEL, we don't // need "symbol" for anything else // than indexing the probability array. - rc_bit_last(coder->pos_align[symbol], , - rep0 += 8, SEQ_ALIGN3); + rc_bit_dist_last(coder->pos_align[ + symbol], + 8, SEQ_ALIGN3); #endif if (rep0 == UINT32_MAX) { diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h index e0b051f..baec45c 100644 --- a/src/liblzma/rangecoder/range_decoder.h +++ b/src/liblzma/rangecoder/range_decoder.h @@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in, @@ -78,12 +196,12 @@ index e0b051f..e4a2b25 100644 } \ } while (0) @@ -163,8 +165,49 @@ do { \ /// statements. On the other hand this also makes it less readable, since /// spotting the places where the decoder loop may be restarted is less /// obvious. +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \ +do { \ + rc_normalize(seq); \ + uint32_t cache = (prob), tmp; \ @@ -99,21 +217,31 @@ index e0b051f..e4a2b25 100644 + ); \ + cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \ + prob -= cache >> RC_MOVE_BITS; \ + pre0; tmp = ~tmp; pre1; \ + rc.code -= tmp & rc_bound; \ + if (!tmp) { action0; } else { action1; }; \ +} while (0) +#define rc_bit_case(prob, action0, action1, seq) \ + case seq: rc_bit_nobranch(prob, , symbol = (symbol << 1) - tmp, \ + action0, action1, seq) +#define rc_bit_matched(prob, seq) \ + rc_bit_nobranch(prob, offset &= match_bit ^ tmp, \ + symbol = (symbol << 1) - tmp, , , seq) +#define rc_bit_dist(prob, offset, seq) \ + rc_bit_nobranch(prob, , \ + symbol = (symbol << 1) - tmp; rep0 += offset & tmp, \ + , , seq); +#define rc_bit_dist_last(prob, offset, seq) \ + rc_bit_nobranch(prob, , rep0 += offset & tmp, , , seq); +#else #define rc_bit_case(prob, action0, action1, seq) \ case seq: rc_bit(prob, action0, action1, seq) +#define rc_bit_matched(prob, seq) \ + rc_bit(prob, offset &= ~match_bit, offset &= match_bit, seq) +#define rc_bit_dist(prob, offset, seq) \ + rc_bit(prob, , rep0 += offset, seq); +#define rc_bit_dist_last(prob, offset, seq) \ + rc_bit_last(prob, , rep0 += offset, seq) +#endif -
ilyakurdyukov revised this gist
Dec 13, 2021 . 1 changed file with 44 additions and 10 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,26 +1,54 @@ From 32a00b809c5989a196a8045cca7c82d4dcec89a8 Mon Sep 17 00:00:00 2001 From: Ilya Kurdyukov <jpegqs@gmail.com> Date: Mon, 13 Dec 2021 19:34:03 +0700 Subject: [PATCH] faster lzma_decoder for x86 Notice: Uses inline assembly with CMOV instruction. Another change that removes the comparison with in_size can give a few percent speedup for architectures with a small number of registers. --- src/liblzma/lzma/lzma_decoder.c | 9 ++---- src/liblzma/rangecoder/range_decoder.h | 41 +++++++++++++++++++++++--- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c index e605a0a..a11e2f7 100644 --- a/src/liblzma/lzma/lzma_decoder.c +++ b/src/liblzma/lzma/lzma_decoder.c @@ -415,9 +415,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr, = offset + match_bit + symbol; - rc_bit(probs[subcoder_index], - offset &= ~match_bit, - offset &= match_bit, + rc_bit_matched(probs[subcoder_index], SEQ_LITERAL_MATCHED); // It seems to be faster to do this @@ -437,10 +435,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr, case seq: \ match_bit = len & offset; \ subcoder_index = offset + match_bit + symbol; \ - rc_bit(probs[subcoder_index], \ - offset &= ~match_bit, \ - offset &= match_bit, \ - seq) + rc_bit_matched(probs[subcoder_index], seq) d(SEQ_LITERAL_MATCHED0); len <<= 1; diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h index e0b051f..e4a2b25 100644 --- a/src/liblzma/rangecoder/range_decoder.h +++ b/src/liblzma/rangecoder/range_decoder.h @@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in, /// variables `in' and `in_size' to be defined. #define rc_to_local(range_decoder, in_pos) \ lzma_range_decoder rc = range_decoder; \ - size_t rc_in_pos = (in_pos); \ + size_t rc_in_pos = in_pos - in_size; \ + const uint8_t *restrict rc_end = in + in_size; \ uint32_t rc_bound @@ -50,20 +78,19 @@ index e0b051f..0c3d140 100644 } \ } while (0) @@ -163,8 +165,39 @@ do { \ /// statements. On the other hand this also makes it less readable, since /// spotting the places where the decoder loop may be restarted is less /// obvious. +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +#define rc_bit_nobranch(prob, pre_action, action0, action1, seq) \ +do { \ + rc_normalize(seq); \ + uint32_t cache = (prob), tmp; \ + rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \ + rc.range -= rc_bound; \ + /* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \ + __asm__ ( \ + "cmpl %3, %2\n\t" \ + "cmovbl %3, %1\n\t" \ + "sbbl %0, %0" \ @@ -72,14 +99,21 @@ index e0b051f..0c3d140 100644 + ); \ + cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \ + prob -= cache >> RC_MOVE_BITS; \ + pre_action; \ + tmp = ~tmp; \ + symbol = (symbol << 1) - tmp; \ + rc.code -= tmp & rc_bound; \ + if (!tmp) { action0; } else { action1; }; \ +} while (0) +#define rc_bit_case(prob, action0, action1, seq) \ + case seq: rc_bit_nobranch(prob, , action0, action1, seq) +#define rc_bit_matched(prob, seq) \ + rc_bit_nobranch(prob, offset &= match_bit ^ tmp, , , seq) +#else #define rc_bit_case(prob, action0, action1, seq) \ case seq: rc_bit(prob, action0, action1, seq) +#define rc_bit_matched(prob, seq) \ + rc_bit(prob, offset &= ~match_bit, offset &= match_bit, seq) +#endif -
ilyakurdyukov revised this gist
Dec 13, 2021 . 1 changed file with 3 additions and 3 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,6 +1,6 @@ From a5a557f4ef82c6574b6fbe2b44030fbfe69be4d5 Mon Sep 17 00:00:00 2001 From: Ilya Kurdyukov <jpegqs@gmail.com> Date: Mon, 13 Dec 2021 16:19:34 +0700 Subject: [PATCH] faster lzma_decoder for x86 Notice: Uses inline assembly with CMOV instruction. @@ -12,7 +12,7 @@ percent speedup for architectures with a small number of registers. 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h index e0b051f..0c3d140 100644 --- a/src/liblzma/rangecoder/range_decoder.h +++ b/src/liblzma/rangecoder/range_decoder.h @@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in, @@ -54,7 +54,7 @@ index e0b051f..a50523a 100644 /// statements. On the other hand this also makes it less readable, since /// spotting the places where the decoder loop may be restarted is less /// obvious. +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +#define rc_bit_case(prob, action0, action1, seq) \ +case seq: \ +do { \ -
ilyakurdyukov created this gist
Dec 13, 2021 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,88 @@ From a5a557f4ef82c6574b6fbe2b44030fbfe69be4d5 Mon Sep 17 00:00:00 2001 From: Ilya Kurdyukov <jpegqs@gmail.com> Date: Mon, 13 Dec 2021 15:58:46 +0700 Subject: [PATCH] faster lzma_decoder for x86 Notice: Uses inline assembly with CMOV instruction. Another change that removes the comparison with in_size can give a few percent speedup for architectures with a small number of registers. --- src/liblzma/rangecoder/range_decoder.h | 35 +++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h index e0b051f..a50523a 100644 --- a/src/liblzma/rangecoder/range_decoder.h +++ b/src/liblzma/rangecoder/range_decoder.h @@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in, /// variables `in' and `in_size' to be defined. #define rc_to_local(range_decoder, in_pos) \ lzma_range_decoder rc = range_decoder; \ - size_t rc_in_pos = (in_pos); \ + size_t rc_in_pos = (in_pos) - in_size; \ + const uint8_t *restrict rc_end = in + in_size; \ uint32_t rc_bound @@ -61,7 +62,7 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in, #define rc_from_local(range_decoder, in_pos) \ do { \ range_decoder = rc; \ - in_pos = rc_in_pos; \ + in_pos = rc_in_pos + in_size; \ } while (0) @@ -87,12 +88,13 @@ do { \ #define rc_normalize(seq) \ do { \ if (rc.range < RC_TOP_VALUE) { \ - if (unlikely(rc_in_pos == in_size)) { \ + if (unlikely(!rc_in_pos)) { \ coder->sequence = seq; \ goto out; \ } \ + rc.code <<= RC_SHIFT_BITS; \ rc.range <<= RC_SHIFT_BITS; \ - rc.code = (rc.code << RC_SHIFT_BITS) | in[rc_in_pos++]; \ + rc.code |= rc_end[rc_in_pos++]; \ } \ } while (0) @@ -163,8 +165,33 @@ do { \ /// statements. On the other hand this also makes it less readable, since /// spotting the places where the decoder loop may be restarted is less /// obvious. +#if defined(__GNUC__) && (defined(__x86__) || defined(__x86_64__)) +#define rc_bit_case(prob, action0, action1, seq) \ +case seq: \ +do { \ + rc_normalize(seq); \ + uint32_t cache = (prob), tmp; \ + rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \ + rc.range -= rc_bound; \ + /* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \ + asm( \ + "cmpl %3, %2\n\t" \ + "cmovbl %3, %1\n\t" \ + "sbbl %0, %0" \ + : "=&r"(tmp), "+&r"(rc.range) \ + : "r"(rc.code), "r"(rc_bound) \ + ); \ + cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \ + prob -= cache >> RC_MOVE_BITS; \ + tmp = ~tmp; \ + symbol = (symbol << 1) - tmp; \ + rc.code -= tmp & rc_bound; \ + if (!tmp) { action0; } else { action1; }; \ +} while (0) +#else #define rc_bit_case(prob, action0, action1, seq) \ case seq: rc_bit(prob, action0, action1, seq) +#endif /// Decode a bit without using a probability. -- 2.17.1