Skip to content

Instantly share code, notes, and snippets.

@ilyakurdyukov
Last active June 18, 2023 17:04
Show Gist options
  • Select an option

  • Save ilyakurdyukov/f514418f3affd677e1ac408ec0c09188 to your computer and use it in GitHub Desktop.

Select an option

Save ilyakurdyukov/f514418f3affd677e1ac408ec0c09188 to your computer and use it in GitHub Desktop.

Revisions

  1. ilyakurdyukov revised this gist Dec 14, 2021. 1 changed file with 33 additions and 6 deletions.
    39 changes: 33 additions & 6 deletions faster_lzma_decoder_x86.patch
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,6 @@
    From f0d2f989c249605e8c951b6770d2c05317fa8114 Mon Sep 17 00:00:00 2001
    From 387fd25f57f41009fc317f7922e957de9f370ff2 Mon Sep 17 00:00:00 2001
    From: Ilya Kurdyukov <jpegqs@gmail.com>
    Date: Tue, 14 Dec 2021 18:37:22 +0700
    Date: Tue, 14 Dec 2021 21:54:32 +0700
    Subject: [PATCH] faster lzma_decoder for x86

    Notice: Uses inline assembly with CMOV instruction.
    @@ -9,8 +9,8 @@ Another change that removes the comparison with in_size can give a few
    percent speedup for architectures with a small number of registers.
    ---
    src/liblzma/lzma/lzma_decoder.c | 78 +++++++++++++-------------
    src/liblzma/rangecoder/range_decoder.h | 51 +++++++++++++++--
    2 files changed, 86 insertions(+), 43 deletions(-)
    src/liblzma/rangecoder/range_decoder.h | 78 ++++++++++++++++++++++++--
    2 files changed, 113 insertions(+), 43 deletions(-)

    diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c
    index e605a0a..ecb804b 100644
    @@ -158,7 +158,7 @@ index e605a0a..ecb804b 100644

    if (rep0 == UINT32_MAX) {
    diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h
    index e0b051f..4bd78dd 100644
    index e0b051f..cc9ac35 100644
    --- a/src/liblzma/rangecoder/range_decoder.h
    +++ b/src/liblzma/rangecoder/range_decoder.h
    @@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
    @@ -196,10 +196,11 @@ index e0b051f..4bd78dd 100644
    } \
    } while (0)

    @@ -151,11 +153,52 @@ do { \
    @@ -151,11 +153,79 @@ do { \

    /// Decodes one bit, updates "symbol", and runs action0 or action1 depending
    /// on the decoded bit.
    +#ifndef NO_BRANCH_OPT
    +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    +#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \
    +do { \
    @@ -214,13 +215,39 @@ index e0b051f..4bd78dd 100644
    + "sbbl %0, %0" \
    + : "=&r"(tmp), "+&r"(rc.range) \
    + : "r"(rc.code), "r"(rc_bound) \
    + : "flags" \
    + ); \
    + cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \
    + prob -= cache >> RC_MOVE_BITS; \
    + pre0; tmp = ~tmp; pre1; \
    + rc.code -= tmp & rc_bound; \
    + if (!tmp) { action0; } else { action1; }; \
    +} while (0)
    +#elif defined(__GNUC__) && defined(__aarch64__)
    +#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \
    +do { \
    + rc_normalize(seq); \
    + uint32_t cache = (prob), tmp; \
    + rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \
    + rc.range -= rc_bound; \
    + /* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \
    + __asm__ ( \
    + "cmp %w2, %w3\n\t" \
    + "csel %w1, %w3, %w1, lo\n\t" \
    + "csetm %w0, lo" \
    + : "=&r"(tmp), "+&r"(rc.range) \
    + : "r"(rc.code), "r"(rc_bound) \
    + : "cc" \
    + ); \
    + cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \
    + prob -= cache >> RC_MOVE_BITS; \
    + pre0; tmp = ~tmp; pre1; \
    + rc.code -= tmp & rc_bound; \
    + if (!tmp) { action0; } else { action1; }; \
    +} while (0)
    +#endif
    +#endif
    +#ifdef rc_bit_nobranch
    +#define rc_bit(prob, action0, action1, seq) \
    + rc_bit_nobranch(prob, , symbol = (symbol << 1) - tmp, \
    + action0, action1, seq)
  2. ilyakurdyukov revised this gist Dec 14, 2021. 1 changed file with 15 additions and 12 deletions.
    27 changes: 15 additions & 12 deletions faster_lzma_decoder_x86.patch
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,6 @@
    From db71be5c2f77ae18afc73bba85b9c11c6624d243 Mon Sep 17 00:00:00 2001
    From f0d2f989c249605e8c951b6770d2c05317fa8114 Mon Sep 17 00:00:00 2001
    From: Ilya Kurdyukov <jpegqs@gmail.com>
    Date: Mon, 13 Dec 2021 21:25:54 +0700
    Date: Tue, 14 Dec 2021 18:37:22 +0700
    Subject: [PATCH] faster lzma_decoder for x86

    Notice: Uses inline assembly with CMOV instruction.
    @@ -158,7 +158,7 @@ index e605a0a..ecb804b 100644

    if (rep0 == UINT32_MAX) {
    diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h
    index e0b051f..baec45c 100644
    index e0b051f..4bd78dd 100644
    --- a/src/liblzma/rangecoder/range_decoder.h
    +++ b/src/liblzma/rangecoder/range_decoder.h
    @@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
    @@ -196,10 +196,10 @@ index e0b051f..baec45c 100644
    } \
    } while (0)

    @@ -163,8 +165,49 @@ do { \
    /// statements. On the other hand this also makes it less readable, since
    /// spotting the places where the decoder loop may be restarted is less
    /// obvious.
    @@ -151,11 +153,52 @@ do { \

    /// Decodes one bit, updates "symbol", and runs action0 or action1 depending
    /// on the decoded bit.
    +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    +#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \
    +do { \
    @@ -221,8 +221,8 @@ index e0b051f..baec45c 100644
    + rc.code -= tmp & rc_bound; \
    + if (!tmp) { action0; } else { action1; }; \
    +} while (0)
    +#define rc_bit_case(prob, action0, action1, seq) \
    + case seq: rc_bit_nobranch(prob, , symbol = (symbol << 1) - tmp, \
    +#define rc_bit(prob, action0, action1, seq) \
    + rc_bit_nobranch(prob, , symbol = (symbol << 1) - tmp, \
    + action0, action1, seq)
    +#define rc_bit_matched(prob, seq) \
    + rc_bit_nobranch(prob, offset &= match_bit ^ tmp, \
    @@ -234,8 +234,11 @@ index e0b051f..baec45c 100644
    +#define rc_bit_dist_last(prob, offset, seq) \
    + rc_bit_nobranch(prob, , rep0 += offset & tmp, , , seq);
    +#else
    #define rc_bit_case(prob, action0, action1, seq) \
    case seq: rc_bit(prob, action0, action1, seq)
    #define rc_bit(prob, action0, action1, seq) \
    rc_bit_last(prob, \
    symbol <<= 1; action0, \
    symbol = (symbol << 1) + 1; action1, \
    seq);
    +#define rc_bit_matched(prob, seq) \
    + rc_bit(prob, offset &= ~match_bit, offset &= match_bit, seq)
    +#define rc_bit_dist(prob, offset, seq) \
    @@ -245,6 +248,6 @@ index e0b051f..baec45c 100644
    +#endif


    /// Decode a bit without using a probability.
    /// Like rc_bit() but add "case seq:" as a prefix. This makes the unrolled
    --
    2.17.1
  3. ilyakurdyukov revised this gist Dec 13, 2021. No changes.
  4. ilyakurdyukov revised this gist Dec 13, 2021. 1 changed file with 142 additions and 14 deletions.
    156 changes: 142 additions & 14 deletions faster_lzma_decoder_x86.patch
    Original file line number Diff line number Diff line change
    @@ -1,19 +1,19 @@
    From 32a00b809c5989a196a8045cca7c82d4dcec89a8 Mon Sep 17 00:00:00 2001
    From db71be5c2f77ae18afc73bba85b9c11c6624d243 Mon Sep 17 00:00:00 2001
    From: Ilya Kurdyukov <jpegqs@gmail.com>
    Date: Mon, 13 Dec 2021 19:34:03 +0700
    Date: Mon, 13 Dec 2021 21:25:54 +0700
    Subject: [PATCH] faster lzma_decoder for x86

    Notice: Uses inline assembly with CMOV instruction.

    Another change that removes the comparison with in_size can give a few
    percent speedup for architectures with a small number of registers.
    ---
    src/liblzma/lzma/lzma_decoder.c | 9 ++----
    src/liblzma/rangecoder/range_decoder.h | 41 +++++++++++++++++++++++---
    2 files changed, 39 insertions(+), 11 deletions(-)
    src/liblzma/lzma/lzma_decoder.c | 78 +++++++++++++-------------
    src/liblzma/rangecoder/range_decoder.h | 51 +++++++++++++++--
    2 files changed, 86 insertions(+), 43 deletions(-)

    diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c
    index e605a0a..a11e2f7 100644
    index e605a0a..ecb804b 100644
    --- a/src/liblzma/lzma/lzma_decoder.c
    +++ b/src/liblzma/lzma/lzma_decoder.c
    @@ -415,9 +415,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
    @@ -39,8 +39,126 @@ index e605a0a..a11e2f7 100644

    d(SEQ_LITERAL_MATCHED0);
    len <<= 1;
    @@ -564,40 +559,43 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
    probs = coder->pos_special + rep0
    - symbol - 1;
    symbol = 1;
    - offset = 0;
    - case SEQ_DIST_MODEL:
    + offset = 1;
    #ifdef HAVE_SMALL
    + limit = 1 << limit;
    + case SEQ_DIST_MODEL:
    do {
    - rc_bit(probs[symbol], ,
    - rep0 += 1U << offset,
    + rc_bit_dist(probs[symbol],
    + offset,
    SEQ_DIST_MODEL);
    - } while (++offset < limit);
    + offset <<= 1;
    + } while (offset < limit);
    #else
    + case SEQ_DIST_MODEL:
    switch (limit) {
    case 5:
    assert(offset == 0);
    - rc_bit(probs[symbol], ,
    - rep0 += 1U,
    + rc_bit_dist(probs[symbol],
    + offset,
    SEQ_DIST_MODEL);
    - ++offset;
    + offset <<= 1;
    --limit;
    case 4:
    - rc_bit(probs[symbol], ,
    - rep0 += 1U << offset,
    + rc_bit_dist(probs[symbol],
    + offset,
    SEQ_DIST_MODEL);
    - ++offset;
    + offset <<= 1;
    --limit;
    case 3:
    - rc_bit(probs[symbol], ,
    - rep0 += 1U << offset,
    + rc_bit_dist(probs[symbol],
    + offset,
    SEQ_DIST_MODEL);
    - ++offset;
    + offset <<= 1;
    --limit;
    case 2:
    - rc_bit(probs[symbol], ,
    - rep0 += 1U << offset,
    + rc_bit_dist(probs[symbol],
    + offset,
    SEQ_DIST_MODEL);
    - ++offset;
    + offset <<= 1;
    --limit;
    case 1:
    // We need "symbol" only for
    @@ -606,8 +604,8 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
    // rc_bit_last() here to omit
    // the unneeded updating of
    // "symbol".
    - rc_bit_last(probs[symbol], ,
    - rep0 += 1U << offset,
    + rc_bit_dist_last(probs[symbol],
    + offset,
    SEQ_DIST_MODEL);
    }
    #endif
    @@ -630,30 +628,32 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
    rep0 <<= ALIGN_BITS;
    symbol = 1;
    #ifdef HAVE_SMALL
    - offset = 0;
    + offset = 1;
    case SEQ_ALIGN:
    do {
    - rc_bit(coder->pos_align[
    - symbol], ,
    - rep0 += 1U << offset,
    + rc_bit_dist(coder->pos_align[
    + symbol],
    + offset,
    SEQ_ALIGN);
    - } while (++offset < ALIGN_BITS);
    + offset <<= 1;
    + } while (offset < (1 << ALIGN_BITS));
    #else
    case SEQ_ALIGN0:
    - rc_bit(coder->pos_align[symbol], ,
    - rep0 += 1, SEQ_ALIGN0);
    + rc_bit_dist(coder->pos_align[symbol],
    + 1, SEQ_ALIGN0);
    case SEQ_ALIGN1:
    - rc_bit(coder->pos_align[symbol], ,
    - rep0 += 2, SEQ_ALIGN1);
    + rc_bit_dist(coder->pos_align[symbol],
    + 2, SEQ_ALIGN1);
    case SEQ_ALIGN2:
    - rc_bit(coder->pos_align[symbol], ,
    - rep0 += 4, SEQ_ALIGN2);
    + rc_bit_dist(coder->pos_align[symbol],
    + 4, SEQ_ALIGN2);
    case SEQ_ALIGN3:
    // Like in SEQ_DIST_MODEL, we don't
    // need "symbol" for anything else
    // than indexing the probability array.
    - rc_bit_last(coder->pos_align[symbol], ,
    - rep0 += 8, SEQ_ALIGN3);
    + rc_bit_dist_last(coder->pos_align[
    + symbol],
    + 8, SEQ_ALIGN3);
    #endif

    if (rep0 == UINT32_MAX) {
    diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h
    index e0b051f..e4a2b25 100644
    index e0b051f..baec45c 100644
    --- a/src/liblzma/rangecoder/range_decoder.h
    +++ b/src/liblzma/rangecoder/range_decoder.h
    @@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
    @@ -78,12 +196,12 @@ index e0b051f..e4a2b25 100644
    } \
    } while (0)

    @@ -163,8 +165,39 @@ do { \
    @@ -163,8 +165,49 @@ do { \
    /// statements. On the other hand this also makes it less readable, since
    /// spotting the places where the decoder loop may be restarted is less
    /// obvious.
    +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    +#define rc_bit_nobranch(prob, pre_action, action0, action1, seq) \
    +#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \
    +do { \
    + rc_normalize(seq); \
    + uint32_t cache = (prob), tmp; \
    @@ -99,21 +217,31 @@ index e0b051f..e4a2b25 100644
    + ); \
    + cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \
    + prob -= cache >> RC_MOVE_BITS; \
    + pre_action; \
    + tmp = ~tmp; \
    + symbol = (symbol << 1) - tmp; \
    + pre0; tmp = ~tmp; pre1; \
    + rc.code -= tmp & rc_bound; \
    + if (!tmp) { action0; } else { action1; }; \
    +} while (0)
    +#define rc_bit_case(prob, action0, action1, seq) \
    + case seq: rc_bit_nobranch(prob, , action0, action1, seq)
    + case seq: rc_bit_nobranch(prob, , symbol = (symbol << 1) - tmp, \
    + action0, action1, seq)
    +#define rc_bit_matched(prob, seq) \
    + rc_bit_nobranch(prob, offset &= match_bit ^ tmp, , , seq)
    + rc_bit_nobranch(prob, offset &= match_bit ^ tmp, \
    + symbol = (symbol << 1) - tmp, , , seq)
    +#define rc_bit_dist(prob, offset, seq) \
    + rc_bit_nobranch(prob, , \
    + symbol = (symbol << 1) - tmp; rep0 += offset & tmp, \
    + , , seq);
    +#define rc_bit_dist_last(prob, offset, seq) \
    + rc_bit_nobranch(prob, , rep0 += offset & tmp, , , seq);
    +#else
    #define rc_bit_case(prob, action0, action1, seq) \
    case seq: rc_bit(prob, action0, action1, seq)
    +#define rc_bit_matched(prob, seq) \
    + rc_bit(prob, offset &= ~match_bit, offset &= match_bit, seq)
    +#define rc_bit_dist(prob, offset, seq) \
    + rc_bit(prob, , rep0 += offset, seq);
    +#define rc_bit_dist_last(prob, offset, seq) \
    + rc_bit_last(prob, , rep0 += offset, seq)
    +#endif


  5. ilyakurdyukov revised this gist Dec 13, 2021. 1 changed file with 44 additions and 10 deletions.
    54 changes: 44 additions & 10 deletions faster_lzma_decoder_x86.patch
    Original file line number Diff line number Diff line change
    @@ -1,26 +1,54 @@
    From a5a557f4ef82c6574b6fbe2b44030fbfe69be4d5 Mon Sep 17 00:00:00 2001
    From 32a00b809c5989a196a8045cca7c82d4dcec89a8 Mon Sep 17 00:00:00 2001
    From: Ilya Kurdyukov <jpegqs@gmail.com>
    Date: Mon, 13 Dec 2021 16:19:34 +0700
    Date: Mon, 13 Dec 2021 19:34:03 +0700
    Subject: [PATCH] faster lzma_decoder for x86

    Notice: Uses inline assembly with CMOV instruction.

    Another change that removes the comparison with in_size can give a few
    percent speedup for architectures with a small number of registers.
    ---
    src/liblzma/rangecoder/range_decoder.h | 35 +++++++++++++++++++++++---
    1 file changed, 31 insertions(+), 4 deletions(-)
    src/liblzma/lzma/lzma_decoder.c | 9 ++----
    src/liblzma/rangecoder/range_decoder.h | 41 +++++++++++++++++++++++---
    2 files changed, 39 insertions(+), 11 deletions(-)

    diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c
    index e605a0a..a11e2f7 100644
    --- a/src/liblzma/lzma/lzma_decoder.c
    +++ b/src/liblzma/lzma/lzma_decoder.c
    @@ -415,9 +415,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
    = offset + match_bit
    + symbol;

    - rc_bit(probs[subcoder_index],
    - offset &= ~match_bit,
    - offset &= match_bit,
    + rc_bit_matched(probs[subcoder_index],
    SEQ_LITERAL_MATCHED);

    // It seems to be faster to do this
    @@ -437,10 +435,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
    case seq: \
    match_bit = len & offset; \
    subcoder_index = offset + match_bit + symbol; \
    - rc_bit(probs[subcoder_index], \
    - offset &= ~match_bit, \
    - offset &= match_bit, \
    - seq)
    + rc_bit_matched(probs[subcoder_index], seq)

    d(SEQ_LITERAL_MATCHED0);
    len <<= 1;
    diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h
    index e0b051f..0c3d140 100644
    index e0b051f..e4a2b25 100644
    --- a/src/liblzma/rangecoder/range_decoder.h
    +++ b/src/liblzma/rangecoder/range_decoder.h
    @@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
    /// variables `in' and `in_size' to be defined.
    #define rc_to_local(range_decoder, in_pos) \
    lzma_range_decoder rc = range_decoder; \
    - size_t rc_in_pos = (in_pos); \
    + size_t rc_in_pos = (in_pos) - in_size; \
    + size_t rc_in_pos = in_pos - in_size; \
    + const uint8_t *restrict rc_end = in + in_size; \
    uint32_t rc_bound

    @@ -50,20 +78,19 @@ index e0b051f..0c3d140 100644
    } \
    } while (0)

    @@ -163,8 +165,33 @@ do { \
    @@ -163,8 +165,39 @@ do { \
    /// statements. On the other hand this also makes it less readable, since
    /// spotting the places where the decoder loop may be restarted is less
    /// obvious.
    +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    +#define rc_bit_case(prob, action0, action1, seq) \
    +case seq: \
    +#define rc_bit_nobranch(prob, pre_action, action0, action1, seq) \
    +do { \
    + rc_normalize(seq); \
    + uint32_t cache = (prob), tmp; \
    + rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \
    + rc.range -= rc_bound; \
    + /* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \
    + asm( \
    + __asm__ ( \
    + "cmpl %3, %2\n\t" \
    + "cmovbl %3, %1\n\t" \
    + "sbbl %0, %0" \
    @@ -72,14 +99,21 @@ index e0b051f..0c3d140 100644
    + ); \
    + cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \
    + prob -= cache >> RC_MOVE_BITS; \
    + pre_action; \
    + tmp = ~tmp; \
    + symbol = (symbol << 1) - tmp; \
    + rc.code -= tmp & rc_bound; \
    + if (!tmp) { action0; } else { action1; }; \
    +} while (0)
    +#define rc_bit_case(prob, action0, action1, seq) \
    + case seq: rc_bit_nobranch(prob, , action0, action1, seq)
    +#define rc_bit_matched(prob, seq) \
    + rc_bit_nobranch(prob, offset &= match_bit ^ tmp, , , seq)
    +#else
    #define rc_bit_case(prob, action0, action1, seq) \
    case seq: rc_bit(prob, action0, action1, seq)
    +#define rc_bit_matched(prob, seq) \
    + rc_bit(prob, offset &= ~match_bit, offset &= match_bit, seq)
    +#endif


  6. ilyakurdyukov revised this gist Dec 13, 2021. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions faster_lzma_decoder_x86.patch
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,6 @@
    From a5a557f4ef82c6574b6fbe2b44030fbfe69be4d5 Mon Sep 17 00:00:00 2001
    From: Ilya Kurdyukov <jpegqs@gmail.com>
    Date: Mon, 13 Dec 2021 15:58:46 +0700
    Date: Mon, 13 Dec 2021 16:19:34 +0700
    Subject: [PATCH] faster lzma_decoder for x86

    Notice: Uses inline assembly with CMOV instruction.
    @@ -12,7 +12,7 @@ percent speedup for architectures with a small number of registers.
    1 file changed, 31 insertions(+), 4 deletions(-)

    diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h
    index e0b051f..a50523a 100644
    index e0b051f..0c3d140 100644
    --- a/src/liblzma/rangecoder/range_decoder.h
    +++ b/src/liblzma/rangecoder/range_decoder.h
    @@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
    @@ -54,7 +54,7 @@ index e0b051f..a50523a 100644
    /// statements. On the other hand this also makes it less readable, since
    /// spotting the places where the decoder loop may be restarted is less
    /// obvious.
    +#if defined(__GNUC__) && (defined(__x86__) || defined(__x86_64__))
    +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    +#define rc_bit_case(prob, action0, action1, seq) \
    +case seq: \
    +do { \
  7. ilyakurdyukov created this gist Dec 13, 2021.
    88 changes: 88 additions & 0 deletions faster_lzma_decoder_x86.patch
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,88 @@
    From a5a557f4ef82c6574b6fbe2b44030fbfe69be4d5 Mon Sep 17 00:00:00 2001
    From: Ilya Kurdyukov <jpegqs@gmail.com>
    Date: Mon, 13 Dec 2021 15:58:46 +0700
    Subject: [PATCH] faster lzma_decoder for x86

    Notice: Uses inline assembly with CMOV instruction.

    Another change that removes the comparison with in_size can give a few
    percent speedup for architectures with a small number of registers.
    ---
    src/liblzma/rangecoder/range_decoder.h | 35 +++++++++++++++++++++++---
    1 file changed, 31 insertions(+), 4 deletions(-)

    diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h
    index e0b051f..a50523a 100644
    --- a/src/liblzma/rangecoder/range_decoder.h
    +++ b/src/liblzma/rangecoder/range_decoder.h
    @@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
    /// variables `in' and `in_size' to be defined.
    #define rc_to_local(range_decoder, in_pos) \
    lzma_range_decoder rc = range_decoder; \
    - size_t rc_in_pos = (in_pos); \
    + size_t rc_in_pos = (in_pos) - in_size; \
    + const uint8_t *restrict rc_end = in + in_size; \
    uint32_t rc_bound


    @@ -61,7 +62,7 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
    #define rc_from_local(range_decoder, in_pos) \
    do { \
    range_decoder = rc; \
    - in_pos = rc_in_pos; \
    + in_pos = rc_in_pos + in_size; \
    } while (0)


    @@ -87,12 +88,13 @@ do { \
    #define rc_normalize(seq) \
    do { \
    if (rc.range < RC_TOP_VALUE) { \
    - if (unlikely(rc_in_pos == in_size)) { \
    + if (unlikely(!rc_in_pos)) { \
    coder->sequence = seq; \
    goto out; \
    } \
    + rc.code <<= RC_SHIFT_BITS; \
    rc.range <<= RC_SHIFT_BITS; \
    - rc.code = (rc.code << RC_SHIFT_BITS) | in[rc_in_pos++]; \
    + rc.code |= rc_end[rc_in_pos++]; \
    } \
    } while (0)

    @@ -163,8 +165,33 @@ do { \
    /// statements. On the other hand this also makes it less readable, since
    /// spotting the places where the decoder loop may be restarted is less
    /// obvious.
    +#if defined(__GNUC__) && (defined(__x86__) || defined(__x86_64__))
    +#define rc_bit_case(prob, action0, action1, seq) \
    +case seq: \
    +do { \
    + rc_normalize(seq); \
    + uint32_t cache = (prob), tmp; \
    + rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \
    + rc.range -= rc_bound; \
    + /* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \
    + asm( \
    + "cmpl %3, %2\n\t" \
    + "cmovbl %3, %1\n\t" \
    + "sbbl %0, %0" \
    + : "=&r"(tmp), "+&r"(rc.range) \
    + : "r"(rc.code), "r"(rc_bound) \
    + ); \
    + cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \
    + prob -= cache >> RC_MOVE_BITS; \
    + tmp = ~tmp; \
    + symbol = (symbol << 1) - tmp; \
    + rc.code -= tmp & rc_bound; \
    + if (!tmp) { action0; } else { action1; }; \
    +} while (0)
    +#else
    #define rc_bit_case(prob, action0, action1, seq) \
    case seq: rc_bit(prob, action0, action1, seq)
    +#endif


    /// Decode a bit without using a probability.
    --
    2.17.1