Skip to content

Instantly share code, notes, and snippets.

@mmozeiko
Last active May 22, 2025 09:09
Show Gist options
  • Select an option

  • Save mmozeiko/f9c999dda7dbb03722409854a1c39cc2 to your computer and use it in GitHub Desktop.

Select an option

Save mmozeiko/f9c999dda7dbb03722409854a1c39cc2 to your computer and use it in GitHub Desktop.

Revisions

  1. mmozeiko revised this gist Mar 15, 2023. 1 changed file with 15 additions and 28 deletions.
    43 changes: 15 additions & 28 deletions meow_hash_armv8.h
    Original file line number Diff line number Diff line change
    @@ -46,25 +46,13 @@
    #define movq(A, B) A = vreinterpretq_u8_u64((uint64x2_t){ (B), 0 })
    #define paddq(A, B) A = vreinterpretq_u8_u64(vaddq_u64(vreinterpretq_u64_u8(A), vreinterpretq_u64_u8(B)))

    #define pxor_clear(A) A = (uint8x16_t){}
    #define pxor(A, B) A = veorq_u8(A, B)
    #define pand(A, B) A = vandq_u8(A, B)
    #define pxor_clear(A) A = vdupq_n_u8(0)
    #define pxor(A, B) A = veorq_u8(A, B)
    #define pand(A, B) A = vandq_u8(A, B)
    #define palignr(A, B, i) A = vextq_u8(B, A, i)
    #define pshufb(A, B) A = vqtbl1q_u8(A, B)

    #define palignr1(A, B) A = \
    vorrq_u8( \
    vqtbl1q_u8(A, (uint8x16_t){0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0}), \
    vqtbl1q_u8(B, (uint8x16_t){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0xff}) \
    )

    #define palignr15(A, B) A = \
    vorrq_u8( \
    vqtbl1q_u8(A, (uint8x16_t){0xff,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14}), \
    vqtbl1q_u8(B, (uint8x16_t){15,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff}) \
    )

    #define pshufb(A, B) A = vqtbl1q_u8(A, B)

    #define aesdec(A, B) A = veorq_u8(vaesimcq_u8(vaesdq_u8((A), (uint8x16_t){})), (B))
    #define aesdec(A, B) A = veorq_u8(vaesimcq_u8(vaesdq_u8((A), vdupq_n_u8(0))), (B))
    #define aesdec_xor(A, X, B) A = veorq_u8(vaesimcq_u8(vaesdq_u8((A), (X))), (B))

    #define MEOW_MIX_REG(r1, r2, r3, r4, r5, i1, i2, i3, i4) do { \
    @@ -206,8 +194,8 @@ MeowHash(void* Seed128Init, meow_umm Len, void* SourceInit)

    xmm8 = xmm9;
    xmm10 = xmm9;
    palignr15(xmm8, xmm11);
    palignr1(xmm10, xmm11);
    palignr(xmm8, xmm11, 15);
    palignr(xmm10, xmm11, 1);

    // NOTE(casey): We have room for a 128-bit nonce and a 64-bit none here, but
    // the decision was made to leave them zero'd so as not to confuse people
    @@ -216,8 +204,8 @@ MeowHash(void* Seed128Init, meow_umm Len, void* SourceInit)
    pxor_clear(xmm13);
    pxor_clear(xmm14);
    movq(xmm15, Len);
    palignr15(xmm12, xmm15);
    palignr1(xmm14, xmm15);
    palignr(xmm12, xmm15, 15);
    palignr(xmm14, xmm15, 1);

    // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty
    MEOW_MIX_REG(xmm0, xmm4, xmm6, xmm1, xmm2, xmm8, xmm9, xmm10, xmm11);
    @@ -417,15 +405,15 @@ MeowEnd(meow_state* State, meow_u8* Store128)

    xmm8 = xmm9;
    xmm10 = xmm9;
    palignr15(xmm8, xmm11);
    palignr1(xmm10, xmm11);
    palignr(xmm8, xmm11, 15);
    palignr(xmm10, xmm11, 1);

    pxor_clear(xmm12);
    pxor_clear(xmm13);
    pxor_clear(xmm14);
    movq(xmm15, Len);
    palignr15(xmm12, xmm15);
    palignr1(xmm14, xmm15);
    palignr(xmm12, xmm15, 15);
    palignr(xmm14, xmm15, 1);

    // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty
    MEOW_MIX_REG(xmm0, xmm4, xmm6, xmm1, xmm2, xmm8, xmm9, xmm10, xmm11);
    @@ -494,8 +482,7 @@ MeowEnd(meow_state* State, meow_u8* Store128)
    #undef pxor
    #undef paddq
    #undef pand
    #undef palignr1
    #undef palignr15
    #undef palignr
    #undef pxor_clear
    #undef MEOW_MIX
    #undef MEOW_MIX_REG
  2. mmozeiko revised this gist Jun 29, 2020. 1 changed file with 7 additions and 4 deletions.
    11 changes: 7 additions & 4 deletions meow_hash_armv8.h
    Original file line number Diff line number Diff line change
    @@ -6,13 +6,16 @@
    // Meow hash v0.5 with ARMv8 Crypto Extension instructions
    // Ported from https://github.com/cmuratori/meow_hash

    // Performance on Pine A64 (1.2GHz Cortex-A53)
    // Performance on Pine A64 (Cortex-A53, 1.2GHz)
    // (compiled with clang v10.0 with -O3 -mcpu=cortex-a53)
    // C code = ~0.34 bytes/cycle
    // this code = ~1.75 bytes/cycle

    // Performance on Jetson Nano (1.43GHz Cortex-A57)
    // C code = ~?? bytes/cycle
    // this code = ~?? bytes/cycle
    // Performance on Jetson Nano (Cortex-A57, 1.43GHz)
    // (compiled with clang v10.0 with -O3 -mcpu=cortex-a57)
    // C code = ~0.5 bytes/cycle
    // this code = ~4.0 bytes/cycle (for ~1MB or less data)
    // ~2.5 bytes/cycle (for >1MB)

    #include <stddef.h>
    #include <stdint.h>
  3. mmozeiko revised this gist Jun 29, 2020. 2 changed files with 62 additions and 65 deletions.
    63 changes: 30 additions & 33 deletions meow_hash_armv8.h
    Original file line number Diff line number Diff line change
    @@ -6,16 +6,13 @@
    // Meow hash v0.5 with ARMv8 Crypto Extension instructions
    // Ported from https://github.com/cmuratori/meow_hash

    // Performance on Pine A64 (Cortex-A53, 1.2GHz)
    // (compiled with clang v10.0 with -O3 -mcpu=cortex-a53)
    // Performance on Pine A64 (1.2GHz Cortex-A53)
    // C code = ~0.34 bytes/cycle
    // this code = ~1.75 bytes/cycle

    // Performance on Jetson Nano (Cortex-A57, 1.43GHz)
    // (compiled with clang v10.0 with -O3 -mcpu=cortex-a57)
    // C code = ~0.5 bytes/cycle
    // this code = ~4.0 bytes/cycle (for ~1MB or less data)
    // ~2.5 bytes/cycle (for >1MB)
    // Performance on Jetson Nano (1.43GHz Cortex-A57)
    // C code = ~?? bytes/cycle
    // this code = ~?? bytes/cycle

    #include <stddef.h>
    #include <stdint.h>
    @@ -68,11 +65,11 @@
    #define aesdec_xor(A, X, B) A = veorq_u8(vaesimcq_u8(vaesdq_u8((A), (X))), (B))

    #define MEOW_MIX_REG(r1, r2, r3, r4, r5, i1, i2, i3, i4) do { \
    aesdec(r1, r2); \
    paddq(r3, i1); \
    aesdec_xor(r2, i2, r4); \
    paddq(r5, i3); \
    pxor(r4, i4); \
    aesdec(r1, r2); \
    paddq(r3, i1); \
    aesdec_xor(r2, i2, r4); \
    paddq(r5, i3); \
    pxor(r4, i4); \
    } while (0)

    #define MEOW_MIX(r1, r2, r3, r4, r5, ptr) do { \
    @@ -84,11 +81,11 @@
    } while (0)

    #define MEOW_SHUFFLE(r1, r2, r3, r4, r5, r6) do { \
    aesdec(r1, r4); \
    paddq(r2, r5); \
    aesdec_xor(r4, r6, r2); \
    paddq(r5, r6); \
    pxor(r2, r3); \
    aesdec(r1, r4); \
    paddq(r2, r5); \
    aesdec_xor(r4, r6, r2); \
    paddq(r5, r6); \
    pxor(r2, r3); \
    } while (0)

    static meow_u8 MeowShiftAdjust[32] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
    @@ -98,21 +95,21 @@ static meow_u8 MeowMaskLen[32] = {255,255,255,255, 255,255,255,255, 255,255,255,
    static meow_u8 MeowDefaultSeed[128] =
    {
    0x32, 0x43, 0xF6, 0xA8, 0x88, 0x5A, 0x30, 0x8D,
    0x31, 0x31, 0x98, 0xA2, 0xE0, 0x37, 0x07, 0x34,
    0x4A, 0x40, 0x93, 0x82, 0x22, 0x99, 0xF3, 0x1D,
    0x00, 0x82, 0xEF, 0xA9, 0x8E, 0xC4, 0xE6, 0xC8,
    0x94, 0x52, 0x82, 0x1E, 0x63, 0x8D, 0x01, 0x37,
    0x7B, 0xE5, 0x46, 0x6C, 0xF3, 0x4E, 0x90, 0xC6,
    0xCC, 0x0A, 0xC2, 0x9B, 0x7C, 0x97, 0xC5, 0x0D,
    0xD3, 0xF8, 0x4D, 0x5B, 0x5B, 0x54, 0x70, 0x91,
    0x79, 0x21, 0x6D, 0x5D, 0x98, 0x97, 0x9F, 0xB1,
    0xBD, 0x13, 0x10, 0xBA, 0x69, 0x8D, 0xFB, 0x5A,
    0xC2, 0xFF, 0xD7, 0x2D, 0xBD, 0x01, 0xAD, 0xFB,
    0x7B, 0x8E, 0x1A, 0xFE, 0xD6, 0xA2, 0x67, 0xE9,
    0x6B, 0xA7, 0xC9, 0x04, 0x5F, 0x12, 0xC7, 0xF9,
    0x92, 0x4A, 0x19, 0x94, 0x7B, 0x39, 0x16, 0xCF,
    0x70, 0x80, 0x1F, 0x2E, 0x28, 0x58, 0xEF, 0xC1,
    0x66, 0x36, 0x92, 0x0D, 0x87, 0x15, 0x74, 0xE6
    0x31, 0x31, 0x98, 0xA2, 0xE0, 0x37, 0x07, 0x34,
    0x4A, 0x40, 0x93, 0x82, 0x22, 0x99, 0xF3, 0x1D,
    0x00, 0x82, 0xEF, 0xA9, 0x8E, 0xC4, 0xE6, 0xC8,
    0x94, 0x52, 0x82, 0x1E, 0x63, 0x8D, 0x01, 0x37,
    0x7B, 0xE5, 0x46, 0x6C, 0xF3, 0x4E, 0x90, 0xC6,
    0xCC, 0x0A, 0xC2, 0x9B, 0x7C, 0x97, 0xC5, 0x0D,
    0xD3, 0xF8, 0x4D, 0x5B, 0x5B, 0x54, 0x70, 0x91,
    0x79, 0x21, 0x6D, 0x5D, 0x98, 0x97, 0x9F, 0xB1,
    0xBD, 0x13, 0x10, 0xBA, 0x69, 0x8D, 0xFB, 0x5A,
    0xC2, 0xFF, 0xD7, 0x2D, 0xBD, 0x01, 0xAD, 0xFB,
    0x7B, 0x8E, 0x1A, 0xFE, 0xD6, 0xA2, 0x67, 0xE9,
    0x6B, 0xA7, 0xC9, 0x04, 0x5F, 0x12, 0xC7, 0xF9,
    0x92, 0x4A, 0x19, 0x94, 0x7B, 0x39, 0x16, 0xCF,
    0x70, 0x80, 0x1F, 0x2E, 0x28, 0x58, 0xEF, 0xC1,
    0x66, 0x36, 0x92, 0x0D, 0x87, 0x15, 0x74, 0xE6,
    };

    //
    @@ -129,7 +126,7 @@ MeowHash(void* Seed128Init, meow_umm Len, void* SourceInit)
    meow_u8* rcx = (meow_u8*)Seed128Init;

    //
    // NOTE(casey): Seed the eight hash registers
    // NOTE(casey): Seed the eight hash registers
    //

    movdqu(xmm0, rcx + 0x00);
    64 changes: 32 additions & 32 deletions meow_hash_c.h
    Original file line number Diff line number Diff line change
    @@ -212,12 +212,12 @@ static const meow_u32 MeowAesBox3[256] = {
    } while (0)

    #define MEOW_MIX_REG(r1, r2, r3, r4, r5, i1, i2, i3, i4) \
    aesdec(r1, r2); \
    paddq(r3, i1); \
    pxor(r2, i2); \
    aesdec(r2, r4); \
    paddq(r5, i3); \
    pxor(r4, i4);
    aesdec(r1, r2); \
    paddq(r3, i1); \
    pxor(r2, i2); \
    aesdec(r2, r4); \
    paddq(r5, i3); \
    pxor(r4, i4);

    #define MEOW_MIX(r1, r2, r3, r4, r5, ptr) do { \
    meow_u128 i1, i2, i3, i4; \
    @@ -229,33 +229,33 @@ static const meow_u32 MeowAesBox3[256] = {
    } while (0)

    #define MEOW_SHUFFLE(r1, r2, r3, r4, r5, r6) do { \
    aesdec(r1, r4); \
    paddq(r2, r5); \
    pxor(r4, r6); \
    aesdec(r4, r2); \
    paddq(r5, r6); \
    pxor(r2, r3); \
    aesdec(r1, r4); \
    paddq(r2, r5); \
    pxor(r4, r6); \
    aesdec(r4, r2); \
    paddq(r5, r6); \
    pxor(r2, r3); \
    } while (0)

    // NOTE(casey): The default seed is now a "nothing-up-our-sleeves" number for good measure. You may verify that it is just an encoding of Pi.
    static meow_u8 MeowDefaultSeed[128] =
    {
    0x32, 0x43, 0xF6, 0xA8, 0x88, 0x5A, 0x30, 0x8D,
    0x31, 0x31, 0x98, 0xA2, 0xE0, 0x37, 0x07, 0x34,
    0x4A, 0x40, 0x93, 0x82, 0x22, 0x99, 0xF3, 0x1D,
    0x00, 0x82, 0xEF, 0xA9, 0x8E, 0xC4, 0xE6, 0xC8,
    0x94, 0x52, 0x82, 0x1E, 0x63, 0x8D, 0x01, 0x37,
    0x7B, 0xE5, 0x46, 0x6C, 0xF3, 0x4E, 0x90, 0xC6,
    0xCC, 0x0A, 0xC2, 0x9B, 0x7C, 0x97, 0xC5, 0x0D,
    0xD3, 0xF8, 0x4D, 0x5B, 0x5B, 0x54, 0x70, 0x91,
    0x79, 0x21, 0x6D, 0x5D, 0x98, 0x97, 0x9F, 0xB1,
    0xBD, 0x13, 0x10, 0xBA, 0x69, 0x8D, 0xFB, 0x5A,
    0xC2, 0xFF, 0xD7, 0x2D, 0xBD, 0x01, 0xAD, 0xFB,
    0x7B, 0x8E, 0x1A, 0xFE, 0xD6, 0xA2, 0x67, 0xE9,
    0x6B, 0xA7, 0xC9, 0x04, 0x5F, 0x12, 0xC7, 0xF9,
    0x92, 0x4A, 0x19, 0x94, 0x7B, 0x39, 0x16, 0xCF,
    0x70, 0x80, 0x1F, 0x2E, 0x28, 0x58, 0xEF, 0xC1,
    0x66, 0x36, 0x92, 0x0D, 0x87, 0x15, 0x74, 0xE6
    0x31, 0x31, 0x98, 0xA2, 0xE0, 0x37, 0x07, 0x34,
    0x4A, 0x40, 0x93, 0x82, 0x22, 0x99, 0xF3, 0x1D,
    0x00, 0x82, 0xEF, 0xA9, 0x8E, 0xC4, 0xE6, 0xC8,
    0x94, 0x52, 0x82, 0x1E, 0x63, 0x8D, 0x01, 0x37,
    0x7B, 0xE5, 0x46, 0x6C, 0xF3, 0x4E, 0x90, 0xC6,
    0xCC, 0x0A, 0xC2, 0x9B, 0x7C, 0x97, 0xC5, 0x0D,
    0xD3, 0xF8, 0x4D, 0x5B, 0x5B, 0x54, 0x70, 0x91,
    0x79, 0x21, 0x6D, 0x5D, 0x98, 0x97, 0x9F, 0xB1,
    0xBD, 0x13, 0x10, 0xBA, 0x69, 0x8D, 0xFB, 0x5A,
    0xC2, 0xFF, 0xD7, 0x2D, 0xBD, 0x01, 0xAD, 0xFB,
    0x7B, 0x8E, 0x1A, 0xFE, 0xD6, 0xA2, 0x67, 0xE9,
    0x6B, 0xA7, 0xC9, 0x04, 0x5F, 0x12, 0xC7, 0xF9,
    0x92, 0x4A, 0x19, 0x94, 0x7B, 0x39, 0x16, 0xCF,
    0x70, 0x80, 0x1F, 0x2E, 0x28, 0x58, 0xEF, 0xC1,
    0x66, 0x36, 0x92, 0x0D, 0x87, 0x15, 0x74, 0xE6,
    };

    //
    @@ -272,7 +272,7 @@ MeowHash(void* Seed128Init, meow_umm Len, void* SourceInit)
    meow_u8* rcx = (meow_u8*)Seed128Init;

    //
    // NOTE(casey): Seed the eight hash registers
    // NOTE(casey): Seed the eight hash registers
    //

    movdqu(xmm0, rcx + 0x00);
    @@ -325,11 +325,11 @@ MeowHash(void* Seed128Init, meow_umm Len, void* SourceInit)
    {
    for (meow_umm i=0; i<Len8; i++)
    {
    xmm9.u8[i] = Last[i];
    xmm9.u8[i] = Last[i];
    }
    for (meow_umm i=Len8; i<0x10; i++)
    {
    xmm9.u8[i] = 0;
    xmm9.u8[i] = 0;
    }
    }

    @@ -546,11 +546,11 @@ MeowEnd(meow_state* State, meow_u8* Store128)
    {
    for (meow_umm i=0; i<Len8; i++)
    {
    xmm9.u8[i] = Last[i];
    xmm9.u8[i] = Last[i];
    }
    for (meow_umm i=Len8; i<0x10; i++)
    {
    xmm9.u8[i] = 0;
    xmm9.u8[i] = 0;
    }
    }

  4. mmozeiko revised this gist Jun 29, 2020. 1 changed file with 525 additions and 0 deletions.
    525 changes: 525 additions & 0 deletions meow_hash_armv8.h
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,525 @@
    #pragma once

    #define MEOW_HASH_VERSION 5
    #define MEOW_HASH_VERSION_NAME "0.5/calico"

    // Meow hash v0.5 with ARMv8 Crypto Extension instructions
    // Ported from https://github.com/cmuratori/meow_hash

    // Performance on Pine A64 (Cortex-A53, 1.2GHz)
    // (compiled with clang v10.0 with -O3 -mcpu=cortex-a53)
    // C code = ~0.34 bytes/cycle
    // this code = ~1.75 bytes/cycle

    // Performance on Jetson Nano (Cortex-A57, 1.43GHz)
    // (compiled with clang v10.0 with -O3 -mcpu=cortex-a57)
    // C code = ~0.5 bytes/cycle
    // this code = ~4.0 bytes/cycle (for ~1MB or less data)
    // ~2.5 bytes/cycle (for >1MB)

    #include <stddef.h>
    #include <stdint.h>
    #include <arm_neon.h>

    #if !defined MEOW_PAGESIZE
    #define MEOW_PAGESIZE 4096
    #endif

    #if !defined(meow_u8)

    #define meow_u8 uint8_t
    #define meow_u32 uint32_t
    #define meow_u64 uint64_t
    #define meow_umm size_t
    #define meow_u128 uint8x16_t

    #endif

    #define MeowU32From(A, I) vgetq_lane_u32(vreinterpretq_u32_u8(A), (I))
    #define MeowU64From(A, I) vgetq_lane_u64(vreinterpretq_u64_u8(A), (I))

    #define MeowHashesAreEqual(A, B) (MeowU64From(A, 0) == MeowU64From(B, 0) && MeowU64From(A, 1) == MeowU64From(B, 1))

    #define movdqu(A, B) A = vld1q_u8(B)
    #define movdqu_mem(A, B) vst1q_u8((A), (B))

    #define movq(A, B) A = vreinterpretq_u8_u64((uint64x2_t){ (B), 0 })
    #define paddq(A, B) A = vreinterpretq_u8_u64(vaddq_u64(vreinterpretq_u64_u8(A), vreinterpretq_u64_u8(B)))

    #define pxor_clear(A) A = (uint8x16_t){}
    #define pxor(A, B) A = veorq_u8(A, B)
    #define pand(A, B) A = vandq_u8(A, B)

    #define palignr1(A, B) A = \
    vorrq_u8( \
    vqtbl1q_u8(A, (uint8x16_t){0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0}), \
    vqtbl1q_u8(B, (uint8x16_t){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0xff}) \
    )

    #define palignr15(A, B) A = \
    vorrq_u8( \
    vqtbl1q_u8(A, (uint8x16_t){0xff,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14}), \
    vqtbl1q_u8(B, (uint8x16_t){15,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff}) \
    )

    #define pshufb(A, B) A = vqtbl1q_u8(A, B)

    #define aesdec(A, B) A = veorq_u8(vaesimcq_u8(vaesdq_u8((A), (uint8x16_t){})), (B))
    #define aesdec_xor(A, X, B) A = veorq_u8(vaesimcq_u8(vaesdq_u8((A), (X))), (B))

    #define MEOW_MIX_REG(r1, r2, r3, r4, r5, i1, i2, i3, i4) do { \
    aesdec(r1, r2); \
    paddq(r3, i1); \
    aesdec_xor(r2, i2, r4); \
    paddq(r5, i3); \
    pxor(r4, i4); \
    } while (0)

    #define MEOW_MIX(r1, r2, r3, r4, r5, ptr) do { \
    meow_u128 i1 = vld1q_u8(ptr + 15); \
    meow_u128 i2 = vld1q_u8(ptr + 0); \
    meow_u128 i3 = vld1q_u8(ptr + 1); \
    meow_u128 i4 = vld1q_u8(ptr + 16); \
    MEOW_MIX_REG(r1, r2, r3, r4, r5, i1, i2, i3, i4); \
    } while (0)

    #define MEOW_SHUFFLE(r1, r2, r3, r4, r5, r6) do { \
    aesdec(r1, r4); \
    paddq(r2, r5); \
    aesdec_xor(r4, r6, r2); \
    paddq(r5, r6); \
    pxor(r2, r3); \
    } while (0)

    static meow_u8 MeowShiftAdjust[32] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
    static meow_u8 MeowMaskLen[32] = {255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};

    // NOTE(casey): The default seed is now a "nothing-up-our-sleeves" number for good measure. You may verify that it is just an encoding of Pi.
    static meow_u8 MeowDefaultSeed[128] =
    {
    0x32, 0x43, 0xF6, 0xA8, 0x88, 0x5A, 0x30, 0x8D,
    0x31, 0x31, 0x98, 0xA2, 0xE0, 0x37, 0x07, 0x34,
    0x4A, 0x40, 0x93, 0x82, 0x22, 0x99, 0xF3, 0x1D,
    0x00, 0x82, 0xEF, 0xA9, 0x8E, 0xC4, 0xE6, 0xC8,
    0x94, 0x52, 0x82, 0x1E, 0x63, 0x8D, 0x01, 0x37,
    0x7B, 0xE5, 0x46, 0x6C, 0xF3, 0x4E, 0x90, 0xC6,
    0xCC, 0x0A, 0xC2, 0x9B, 0x7C, 0x97, 0xC5, 0x0D,
    0xD3, 0xF8, 0x4D, 0x5B, 0x5B, 0x54, 0x70, 0x91,
    0x79, 0x21, 0x6D, 0x5D, 0x98, 0x97, 0x9F, 0xB1,
    0xBD, 0x13, 0x10, 0xBA, 0x69, 0x8D, 0xFB, 0x5A,
    0xC2, 0xFF, 0xD7, 0x2D, 0xBD, 0x01, 0xAD, 0xFB,
    0x7B, 0x8E, 0x1A, 0xFE, 0xD6, 0xA2, 0x67, 0xE9,
    0x6B, 0xA7, 0xC9, 0x04, 0x5F, 0x12, 0xC7, 0xF9,
    0x92, 0x4A, 0x19, 0x94, 0x7B, 0x39, 0x16, 0xCF,
    0x70, 0x80, 0x1F, 0x2E, 0x28, 0x58, 0xEF, 0xC1,
    0x66, 0x36, 0x92, 0x0D, 0x87, 0x15, 0x74, 0xE6
    };

    //
    // NOTE(casey): Single block version
    //

    static meow_u128
    MeowHash(void* Seed128Init, meow_umm Len, void* SourceInit)
    {
    meow_u128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; // NOTE(casey): xmm0-xmm7 are the hash accumulation lanes
    meow_u128 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; // NOTE(casey): xmm8-xmm15 hold values to be appended (residual, length)

    meow_u8* rax = (meow_u8*)SourceInit;
    meow_u8* rcx = (meow_u8*)Seed128Init;

    //
    // NOTE(casey): Seed the eight hash registers
    //

    movdqu(xmm0, rcx + 0x00);
    movdqu(xmm1, rcx + 0x10);
    movdqu(xmm2, rcx + 0x20);
    movdqu(xmm3, rcx + 0x30);

    movdqu(xmm4, rcx + 0x40);
    movdqu(xmm5, rcx + 0x50);
    movdqu(xmm6, rcx + 0x60);
    movdqu(xmm7, rcx + 0x70);

    //
    // NOTE(casey): Hash all full 256-byte blocks
    //

    meow_umm BlockCount = (Len >> 8);
    while (BlockCount--)
    {
    MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00);
    MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20);
    MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40);
    MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60);
    MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80);
    MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0);
    MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0);
    MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0);

    rax += 0x100;
    }

    //
    // NOTE(casey): Load any less-than-32-byte residual
    //

    pxor_clear(xmm9);
    pxor_clear(xmm11);

    //
    // TODO(casey): I need to put more thought into how the end-of-buffer stuff is actually working out here,
    // because I _think_ it may be possible to remove the first branch (on Len8) and let the mask zero out the
    // result, but it would take a little thought to make sure it couldn't read off the end of the buffer due
    // to the & 0xf on the align computation.
    //

    // NOTE(casey): First, we have to load the part that is _not_ 16-byte aligned
    meow_u8 *Last = (meow_u8*)SourceInit + (Len & ~0xf);
    meow_u32 Len8 = (Len & 0xf);
    if (Len8)
    {
    // NOTE(casey): Load the mask early
    movdqu(xmm8, &MeowMaskLen[0x10 - Len8]);

    meow_u8 *LastOk = (meow_u8*)((((meow_umm)(((meow_u8 *)SourceInit)+Len - 1)) | (MEOW_PAGESIZE - 1)) - 16);
    int Align = (Last > LastOk) ? ((int)(meow_umm)Last) & 0xf : 0;
    movdqu(xmm10, &MeowShiftAdjust[Align]);
    movdqu(xmm9, Last - Align);
    pshufb(xmm9, xmm10);

    // NOTE(jeffr): and off the extra bytes
    pand(xmm9, xmm8);
    }

    // NOTE(casey): Next, we have to load the part that _is_ 16-byte aligned
    if (Len & 0x10)
    {
    xmm11 = xmm9;
    movdqu(xmm9, Last - 0x10);
    }

    //
    // NOTE(casey): Construct the residual and length injests
    //

    xmm8 = xmm9;
    xmm10 = xmm9;
    palignr15(xmm8, xmm11);
    palignr1(xmm10, xmm11);

    // NOTE(casey): We have room for a 128-bit nonce and a 64-bit none here, but
    // the decision was made to leave them zero'd so as not to confuse people
    // about hwo to use them or what security implications they had.
    pxor_clear(xmm12);
    pxor_clear(xmm13);
    pxor_clear(xmm14);
    movq(xmm15, Len);
    palignr15(xmm12, xmm15);
    palignr1(xmm14, xmm15);

    // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty
    MEOW_MIX_REG(xmm0, xmm4, xmm6, xmm1, xmm2, xmm8, xmm9, xmm10, xmm11);

    // NOTE(casey): Append the length, to avoid problems with our 32-byte padding
    MEOW_MIX_REG(xmm1, xmm5, xmm7, xmm2, xmm3, xmm12, xmm13, xmm14, xmm15);

    //
    // NOTE(casey): Hash all full 32-byte blocks
    //
    meow_u32 LaneCount = (Len >> 5) & 0x7;
    if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x00); --LaneCount;
    if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x20); --LaneCount;
    if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x40); --LaneCount;
    if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0x60); --LaneCount;
    if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0x80); --LaneCount;
    if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xa0); --LaneCount;
    if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0xc0); --LaneCount;

    //
    // NOTE(casey): Mix the eight lanes down to one 128-bit hash
    //

    MixDown:

    MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6);
    MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7);
    MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0);
    MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1);
    MEOW_SHUFFLE(xmm4, xmm5, xmm6, xmm0, xmm1, xmm2);
    MEOW_SHUFFLE(xmm5, xmm6, xmm7, xmm1, xmm2, xmm3);
    MEOW_SHUFFLE(xmm6, xmm7, xmm0, xmm2, xmm3, xmm4);
    MEOW_SHUFFLE(xmm7, xmm0, xmm1, xmm3, xmm4, xmm5);
    MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6);
    MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7);
    MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0);
    MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1);

    paddq(xmm0, xmm2);
    paddq(xmm1, xmm3);
    paddq(xmm4, xmm6);
    paddq(xmm5, xmm7);
    pxor(xmm0, xmm1);
    pxor(xmm4, xmm5);
    paddq(xmm0, xmm4);

    return xmm0;
    }

    //
    // NOTE(casey): Streaming construction
    //

    typedef struct
    {
    meow_u128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
    meow_u64 TotalLengthInBytes;

    meow_u32 BufferLen;

    meow_u8 Buffer[256];
    meow_u128 Pad[2]; // NOTE(casey): So we know we can over-read Buffer as necessary
    } meow_state;

    static void
    MeowBegin(meow_state* State, void* Seed128)
    {
    meow_u8* rcx = (meow_u8*)Seed128;

    movdqu(State->xmm0, rcx + 0x00);
    movdqu(State->xmm1, rcx + 0x10);
    movdqu(State->xmm2, rcx + 0x20);
    movdqu(State->xmm3, rcx + 0x30);
    movdqu(State->xmm4, rcx + 0x40);
    movdqu(State->xmm5, rcx + 0x50);
    movdqu(State->xmm6, rcx + 0x60);
    movdqu(State->xmm7, rcx + 0x70);

    State->BufferLen = 0;
    State->TotalLengthInBytes = 0;
    }

    static void
    MeowAbsorbBlocks(meow_state* State, meow_umm BlockCount, meow_u8* rax)
    {
    meow_u128 xmm0 = State->xmm0;
    meow_u128 xmm1 = State->xmm1;
    meow_u128 xmm2 = State->xmm2;
    meow_u128 xmm3 = State->xmm3;
    meow_u128 xmm4 = State->xmm4;
    meow_u128 xmm5 = State->xmm5;
    meow_u128 xmm6 = State->xmm6;
    meow_u128 xmm7 = State->xmm7;

    while (BlockCount--)
    {
    MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00);
    MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20);
    MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40);
    MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60);
    MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80);
    MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0);
    MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0);
    MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0);

    rax += 0x100;
    }

    State->xmm0 = xmm0;
    State->xmm1 = xmm1;
    State->xmm2 = xmm2;
    State->xmm3 = xmm3;
    State->xmm4 = xmm4;
    State->xmm5 = xmm5;
    State->xmm6 = xmm6;
    State->xmm7 = xmm7;
    }

    static void
    MeowAbsorb(meow_state* State, meow_umm Len, void* SourceInit)
    {
    State->TotalLengthInBytes += Len;
    meow_u8* Source = (meow_u8*)SourceInit;

    // NOTE(casey): Handle any buffered residual
    if (State->BufferLen)
    {
    meow_u32 Fill = (sizeof(State->Buffer) - State->BufferLen);
    if (Fill > Len)
    {
    Fill = (meow_u32)Len;
    }

    Len -= Fill;
    while (Fill--)
    {
    State->Buffer[State->BufferLen++] = *Source++;
    }

    if (State->BufferLen == sizeof(State->Buffer))
    {
    MeowAbsorbBlocks(State, 1, State->Buffer);
    State->BufferLen = 0;
    }
    }

    // NOTE(casey): Handle any full blocks
    meow_u64 BlockCount = (Len >> 8);
    meow_u64 Advance = (BlockCount << 8);
    MeowAbsorbBlocks(State, BlockCount, Source);

    Len -= Advance;
    Source += Advance;

    // NOTE(casey): Store residual
    while (Len--)
    {
    State->Buffer[State->BufferLen++] = *Source++;
    }
    }

    static meow_u128
    MeowEnd(meow_state* State, meow_u8* Store128)
    {
    meow_u64 Len = State->TotalLengthInBytes;

    meow_u128 xmm0 = State->xmm0;
    meow_u128 xmm1 = State->xmm1;
    meow_u128 xmm2 = State->xmm2;
    meow_u128 xmm3 = State->xmm3;
    meow_u128 xmm4 = State->xmm4;
    meow_u128 xmm5 = State->xmm5;
    meow_u128 xmm6 = State->xmm6;
    meow_u128 xmm7 = State->xmm7;

    meow_u128 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;

    meow_u8* rax = State->Buffer;

    pxor_clear(xmm9);
    pxor_clear(xmm11);

    meow_u8* Last = (uint8_t*)rax + (Len & 0xf0);
    meow_u32 Len8 = (Len & 0xf);
    if (Len8)
    {
    movdqu(xmm8, &MeowMaskLen[0x10 - Len8]);
    movdqu(xmm9, Last);
    pand(xmm9, xmm8);
    }

    if (Len & 0x10)
    {
    xmm11 = xmm9;
    movdqu(xmm9, Last - 0x10);
    }

    xmm8 = xmm9;
    xmm10 = xmm9;
    palignr15(xmm8, xmm11);
    palignr1(xmm10, xmm11);

    pxor_clear(xmm12);
    pxor_clear(xmm13);
    pxor_clear(xmm14);
    movq(xmm15, Len);
    palignr15(xmm12, xmm15);
    palignr1(xmm14, xmm15);

    // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty
    MEOW_MIX_REG(xmm0, xmm4, xmm6, xmm1, xmm2, xmm8, xmm9, xmm10, xmm11);

    // NOTE(casey): Append the length, to avoid problems with our 32-byte padding
    MEOW_MIX_REG(xmm1, xmm5, xmm7, xmm2, xmm3, xmm12, xmm13, xmm14, xmm15);

    //
    // NOTE(casey): Hash all full 32-byte blocks
    //
    meow_u32 LaneCount = (Len >> 5) & 0x7;
    if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x00); --LaneCount;
    if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x20); --LaneCount;
    if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x40); --LaneCount;
    if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0x60); --LaneCount;
    if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0x80); --LaneCount;
    if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xa0); --LaneCount;
    if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0xc0); --LaneCount;

    //
    // NOTE(casey): Mix the eight lanes down to one 128-bit hash
    //

    MixDown:

    MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6);
    MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7);
    MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0);
    MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1);
    MEOW_SHUFFLE(xmm4, xmm5, xmm6, xmm0, xmm1, xmm2);
    MEOW_SHUFFLE(xmm5, xmm6, xmm7, xmm1, xmm2, xmm3);
    MEOW_SHUFFLE(xmm6, xmm7, xmm0, xmm2, xmm3, xmm4);
    MEOW_SHUFFLE(xmm7, xmm0, xmm1, xmm3, xmm4, xmm5);
    MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6);
    MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7);
    MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0);
    MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1);

    if (Store128)
    {
    movdqu_mem(Store128 + 0x00, xmm0);
    movdqu_mem(Store128 + 0x10, xmm1);
    movdqu_mem(Store128 + 0x20, xmm2);
    movdqu_mem(Store128 + 0x30, xmm3);
    movdqu_mem(Store128 + 0x40, xmm4);
    movdqu_mem(Store128 + 0x50, xmm5);
    movdqu_mem(Store128 + 0x60, xmm6);
    movdqu_mem(Store128 + 0x70, xmm7);
    }

    paddq(xmm0, xmm2);
    paddq(xmm1, xmm3);
    paddq(xmm4, xmm6);
    paddq(xmm5, xmm7);
    pxor(xmm0, xmm1);
    pxor(xmm4, xmm5);
    paddq(xmm0, xmm4);

    return(xmm0);
    }

    #undef movdqu
    #undef movdqu_mem
    #undef movq
    #undef aesdec
    #undef pxor
    #undef paddq
    #undef pand
    #undef palignr1
    #undef palignr15
    #undef pxor_clear
    #undef MEOW_MIX
    #undef MEOW_MIX_REG
    #undef MEOW_SHUFFLE

    //
    // NOTE(casey): If you need to create your own seed from non-random data, you can use MeowExpandSeed
    // to create a seed which you then store for repeated use. It is _expensive_ to generate the seed,
    // so you do not want to do this every time you hash. You _only_ want to do it when you actually
    // need to create a new seed.
    //

    static void
    MeowExpandSeed(meow_umm InputLen, void* Input, meow_u8* SeedResult)
    {
    meow_state State;
    meow_u64 LengthTab = (meow_u64)InputLen; // NOTE(casey): We need to always injest 8-byte lengths exactly, even on 32-bit builds, to ensure identical results
    meow_umm InjestCount = (256 / InputLen) + 2;

    MeowBegin(&State, MeowDefaultSeed);
    MeowAbsorb(&State, sizeof(LengthTab), &LengthTab);
    while (InjestCount--)
    {
    MeowAbsorb(&State, InputLen, Input);
    }
    MeowEnd(&State, SeedResult);
    }
  5. mmozeiko created this gist Jun 29, 2020.
    670 changes: 670 additions & 0 deletions meow_hash_c.h
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,670 @@
    #pragma once

    #define MEOW_HASH_VERSION 5
    #define MEOW_HASH_VERSION_NAME "0.5/calico"

    // Meow hash v0.5 in C without dependency on special CPU instructions
    // Ported from https://github.com/cmuratori/meow_hash

    // Performance on Ryzen 9 3950X
    // AESNI code = ~16 bytes/cycle
    // this code = ~0.78 bytes/cycle

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    #if !defined MEOW_PAGESIZE
    #define MEOW_PAGESIZE 4096
    #endif

    #if !defined(meow_u8)

    #define meow_u8 uint8_t
    #define meow_u32 uint32_t
    #define meow_u64 uint64_t
    #define meow_umm size_t

    typedef union
    {
    meow_u8 u8[16];
    meow_u32 u32[4];
    meow_u64 u64[2];
    } meow_u128;

    #endif

    #define MeowU32From(A, I) ((A).u32[I])
    #define MeowU64From(A, I) ((A).u64[I])

    #define MeowHashesAreEqual(A, B) ((A).u64[0] == (B).u64[0] && (A).u64[1] == (B).u64[1])

    static const meow_u32 MeowAesBox0[256] = {
    0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a, 0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b,
    0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5, 0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5,
    0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d, 0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b,
    0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295, 0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e,
    0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927, 0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d,
    0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362, 0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9,
    0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52, 0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566,
    0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3, 0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed,
    0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e, 0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4,
    0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4, 0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd,
    0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d, 0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060,
    0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967, 0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879,
    0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000, 0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c,
    0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36, 0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624,
    0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b, 0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c,
    0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12, 0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14,
    0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3, 0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b,
    0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8, 0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684,
    0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7, 0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177,
    0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947, 0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322,
    0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498, 0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f,
    0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54, 0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382,
    0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf, 0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb,
    0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83, 0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef,
    0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029, 0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235,
    0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733, 0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117,
    0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4, 0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546,
    0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb, 0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d,
    0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb, 0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a,
    0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773, 0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478,
    0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2, 0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff,
    0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664, 0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0,
    };

    static const meow_u32 MeowAesBox1[256] = {
    0x5150a7f4, 0x7e536541, 0x1ac3a417, 0x3a965e27, 0x3bcb6bab, 0x1ff1459d, 0xacab58fa, 0x4b9303e3,
    0x2055fa30, 0xadf66d76, 0x889176cc, 0xf5254c02, 0x4ffcd7e5, 0xc5d7cb2a, 0x26804435, 0xb58fa362,
    0xde495ab1, 0x25671bba, 0x45980eea, 0x5de1c0fe, 0xc302752f, 0x8112f04c, 0x8da39746, 0x6bc6f9d3,
    0x03e75f8f, 0x15959c92, 0xbfeb7a6d, 0x95da5952, 0xd42d83be, 0x58d32174, 0x492969e0, 0x8e44c8c9,
    0x756a89c2, 0xf478798e, 0x996b3e58, 0x27dd71b9, 0xbeb64fe1, 0xf017ad88, 0xc966ac20, 0x7db43ace,
    0x63184adf, 0xe582311a, 0x97603351, 0x62457f53, 0xb1e07764, 0xbb84ae6b, 0xfe1ca081, 0xf9942b08,
    0x70586848, 0x8f19fd45, 0x94876cde, 0x52b7f87b, 0xab23d373, 0x72e2024b, 0xe3578f1f, 0x662aab55,
    0xb20728eb, 0x2f03c2b5, 0x869a7bc5, 0xd3a50837, 0x30f28728, 0x23b2a5bf, 0x02ba6a03, 0xed5c8216,
    0x8a2b1ccf, 0xa792b479, 0xf3f0f207, 0x4ea1e269, 0x65cdf4da, 0x06d5be05, 0xd11f6234, 0xc48afea6,
    0x349d532e, 0xa2a055f3, 0x0532e18a, 0xa475ebf6, 0x0b39ec83, 0x40aaef60, 0x5e069f71, 0xbd51106e,
    0x3ef98a21, 0x963d06dd, 0xddae053e, 0x4d46bde6, 0x91b58d54, 0x71055dc4, 0x046fd406, 0x60ff1550,
    0x1924fb98, 0xd697e9bd, 0x89cc4340, 0x67779ed9, 0xb0bd42e8, 0x07888b89, 0xe7385b19, 0x79dbeec8,
    0xa1470a7c, 0x7ce90f42, 0xf8c91e84, 0x00000000, 0x09838680, 0x3248ed2b, 0x1eac7011, 0x6c4e725a,
    0xfdfbff0e, 0x0f563885, 0x3d1ed5ae, 0x3627392d, 0x0a64d90f, 0x6821a65c, 0x9bd1545b, 0x243a2e36,
    0x0cb1670a, 0x930fe757, 0xb4d296ee, 0x1b9e919b, 0x804fc5c0, 0x61a220dc, 0x5a694b77, 0x1c161a12,
    0xe20aba93, 0xc0e52aa0, 0x3c43e022, 0x121d171b, 0x0e0b0d09, 0xf2adc78b, 0x2db9a8b6, 0x14c8a91e,
    0x578519f1, 0xaf4c0775, 0xeebbdd99, 0xa3fd607f, 0xf79f2601, 0x5cbcf572, 0x44c53b66, 0x5b347efb,
    0x8b762943, 0xcbdcc623, 0xb668fced, 0xb863f1e4, 0xd7cadc31, 0x42108563, 0x13402297, 0x842011c6,
    0x857d244a, 0xd2f83dbb, 0xae1132f9, 0xc76da129, 0x1d4b2f9e, 0xdcf330b2, 0x0dec5286, 0x77d0e3c1,
    0x2b6c16b3, 0xa999b970, 0x11fa4894, 0x472264e9, 0xa8c48cfc, 0xa01a3ff0, 0x56d82c7d, 0x22ef9033,
    0x87c74e49, 0xd9c1d138, 0x8cfea2ca, 0x98360bd4, 0xa6cf81f5, 0xa528de7a, 0xda268eb7, 0x3fa4bfad,
    0x2ce49d3a, 0x500d9278, 0x6a9bcc5f, 0x5462467e, 0xf6c2138d, 0x90e8b8d8, 0x2e5ef739, 0x82f5afc3,
    0x9fbe805d, 0x697c93d0, 0x6fa92dd5, 0xcfb31225, 0xc83b99ac, 0x10a77d18, 0xe86e639c, 0xdb7bbb3b,
    0xcd097826, 0x6ef41859, 0xec01b79a, 0x83a89a4f, 0xe6656e95, 0xaa7ee6ff, 0x2108cfbc, 0xefe6e815,
    0xbad99be7, 0x4ace366f, 0xead4099f, 0x29d67cb0, 0x31afb2a4, 0x2a31233f, 0xc63094a5, 0x35c066a2,
    0x7437bc4e, 0xfca6ca82, 0xe0b0d090, 0x3315d8a7, 0xf14a9804, 0x41f7daec, 0x7f0e50cd, 0x172ff691,
    0x768dd64d, 0x434db0ef, 0xcc544daa, 0xe4df0496, 0x9ee3b5d1, 0x4c1b886a, 0xc1b81f2c, 0x467f5165,
    0x9d04ea5e, 0x015d358c, 0xfa737487, 0xfb2e410b, 0xb35a1d67, 0x9252d2db, 0xe9335610, 0x6d1347d6,
    0x9a8c61d7, 0x377a0ca1, 0x598e14f8, 0xeb893c13, 0xceee27a9, 0xb735c961, 0xe1ede51c, 0x7a3cb147,
    0x9c59dfd2, 0x553f73f2, 0x1879ce14, 0x73bf37c7, 0x53eacdf7, 0x5f5baafd, 0xdf146f3d, 0x7886db44,
    0xca81f3af, 0xb93ec468, 0x382c3424, 0xc25f40a3, 0x1672c31d, 0xbc0c25e2, 0x288b493c, 0xff41950d,
    0x397101a8, 0x08deb30c, 0xd89ce4b4, 0x6490c156, 0x7b6184cb, 0xd570b632, 0x48745c6c, 0xd04257b8,
    };

    static const meow_u32 MeowAesBox2[256] = {
    0xf45150a7, 0x417e5365, 0x171ac3a4, 0x273a965e, 0xab3bcb6b, 0x9d1ff145, 0xfaacab58, 0xe34b9303,
    0x302055fa, 0x76adf66d, 0xcc889176, 0x02f5254c, 0xe54ffcd7, 0x2ac5d7cb, 0x35268044, 0x62b58fa3,
    0xb1de495a, 0xba25671b, 0xea45980e, 0xfe5de1c0, 0x2fc30275, 0x4c8112f0, 0x468da397, 0xd36bc6f9,
    0x8f03e75f, 0x9215959c, 0x6dbfeb7a, 0x5295da59, 0xbed42d83, 0x7458d321, 0xe0492969, 0xc98e44c8,
    0xc2756a89, 0x8ef47879, 0x58996b3e, 0xb927dd71, 0xe1beb64f, 0x88f017ad, 0x20c966ac, 0xce7db43a,
    0xdf63184a, 0x1ae58231, 0x51976033, 0x5362457f, 0x64b1e077, 0x6bbb84ae, 0x81fe1ca0, 0x08f9942b,
    0x48705868, 0x458f19fd, 0xde94876c, 0x7b52b7f8, 0x73ab23d3, 0x4b72e202, 0x1fe3578f, 0x55662aab,
    0xebb20728, 0xb52f03c2, 0xc5869a7b, 0x37d3a508, 0x2830f287, 0xbf23b2a5, 0x0302ba6a, 0x16ed5c82,
    0xcf8a2b1c, 0x79a792b4, 0x07f3f0f2, 0x694ea1e2, 0xda65cdf4, 0x0506d5be, 0x34d11f62, 0xa6c48afe,
    0x2e349d53, 0xf3a2a055, 0x8a0532e1, 0xf6a475eb, 0x830b39ec, 0x6040aaef, 0x715e069f, 0x6ebd5110,
    0x213ef98a, 0xdd963d06, 0x3eddae05, 0xe64d46bd, 0x5491b58d, 0xc471055d, 0x06046fd4, 0x5060ff15,
    0x981924fb, 0xbdd697e9, 0x4089cc43, 0xd967779e, 0xe8b0bd42, 0x8907888b, 0x19e7385b, 0xc879dbee,
    0x7ca1470a, 0x427ce90f, 0x84f8c91e, 0x00000000, 0x80098386, 0x2b3248ed, 0x111eac70, 0x5a6c4e72,
    0x0efdfbff, 0x850f5638, 0xae3d1ed5, 0x2d362739, 0x0f0a64d9, 0x5c6821a6, 0x5b9bd154, 0x36243a2e,
    0x0a0cb167, 0x57930fe7, 0xeeb4d296, 0x9b1b9e91, 0xc0804fc5, 0xdc61a220, 0x775a694b, 0x121c161a,
    0x93e20aba, 0xa0c0e52a, 0x223c43e0, 0x1b121d17, 0x090e0b0d, 0x8bf2adc7, 0xb62db9a8, 0x1e14c8a9,
    0xf1578519, 0x75af4c07, 0x99eebbdd, 0x7fa3fd60, 0x01f79f26, 0x725cbcf5, 0x6644c53b, 0xfb5b347e,
    0x438b7629, 0x23cbdcc6, 0xedb668fc, 0xe4b863f1, 0x31d7cadc, 0x63421085, 0x97134022, 0xc6842011,
    0x4a857d24, 0xbbd2f83d, 0xf9ae1132, 0x29c76da1, 0x9e1d4b2f, 0xb2dcf330, 0x860dec52, 0xc177d0e3,
    0xb32b6c16, 0x70a999b9, 0x9411fa48, 0xe9472264, 0xfca8c48c, 0xf0a01a3f, 0x7d56d82c, 0x3322ef90,
    0x4987c74e, 0x38d9c1d1, 0xca8cfea2, 0xd498360b, 0xf5a6cf81, 0x7aa528de, 0xb7da268e, 0xad3fa4bf,
    0x3a2ce49d, 0x78500d92, 0x5f6a9bcc, 0x7e546246, 0x8df6c213, 0xd890e8b8, 0x392e5ef7, 0xc382f5af,
    0x5d9fbe80, 0xd0697c93, 0xd56fa92d, 0x25cfb312, 0xacc83b99, 0x1810a77d, 0x9ce86e63, 0x3bdb7bbb,
    0x26cd0978, 0x596ef418, 0x9aec01b7, 0x4f83a89a, 0x95e6656e, 0xffaa7ee6, 0xbc2108cf, 0x15efe6e8,
    0xe7bad99b, 0x6f4ace36, 0x9fead409, 0xb029d67c, 0xa431afb2, 0x3f2a3123, 0xa5c63094, 0xa235c066,
    0x4e7437bc, 0x82fca6ca, 0x90e0b0d0, 0xa73315d8, 0x04f14a98, 0xec41f7da, 0xcd7f0e50, 0x91172ff6,
    0x4d768dd6, 0xef434db0, 0xaacc544d, 0x96e4df04, 0xd19ee3b5, 0x6a4c1b88, 0x2cc1b81f, 0x65467f51,
    0x5e9d04ea, 0x8c015d35, 0x87fa7374, 0x0bfb2e41, 0x67b35a1d, 0xdb9252d2, 0x10e93356, 0xd66d1347,
    0xd79a8c61, 0xa1377a0c, 0xf8598e14, 0x13eb893c, 0xa9ceee27, 0x61b735c9, 0x1ce1ede5, 0x477a3cb1,
    0xd29c59df, 0xf2553f73, 0x141879ce, 0xc773bf37, 0xf753eacd, 0xfd5f5baa, 0x3ddf146f, 0x447886db,
    0xafca81f3, 0x68b93ec4, 0x24382c34, 0xa3c25f40, 0x1d1672c3, 0xe2bc0c25, 0x3c288b49, 0x0dff4195,
    0xa8397101, 0x0c08deb3, 0xb4d89ce4, 0x566490c1, 0xcb7b6184, 0x32d570b6, 0x6c48745c, 0xb8d04257,
    };

    static const meow_u32 MeowAesBox3[256] = {
    0xa7f45150, 0x65417e53, 0xa4171ac3, 0x5e273a96, 0x6bab3bcb, 0x459d1ff1, 0x58faacab, 0x03e34b93,
    0xfa302055, 0x6d76adf6, 0x76cc8891, 0x4c02f525, 0xd7e54ffc, 0xcb2ac5d7, 0x44352680, 0xa362b58f,
    0x5ab1de49, 0x1bba2567, 0x0eea4598, 0xc0fe5de1, 0x752fc302, 0xf04c8112, 0x97468da3, 0xf9d36bc6,
    0x5f8f03e7, 0x9c921595, 0x7a6dbfeb, 0x595295da, 0x83bed42d, 0x217458d3, 0x69e04929, 0xc8c98e44,
    0x89c2756a, 0x798ef478, 0x3e58996b, 0x71b927dd, 0x4fe1beb6, 0xad88f017, 0xac20c966, 0x3ace7db4,
    0x4adf6318, 0x311ae582, 0x33519760, 0x7f536245, 0x7764b1e0, 0xae6bbb84, 0xa081fe1c, 0x2b08f994,
    0x68487058, 0xfd458f19, 0x6cde9487, 0xf87b52b7, 0xd373ab23, 0x024b72e2, 0x8f1fe357, 0xab55662a,
    0x28ebb207, 0xc2b52f03, 0x7bc5869a, 0x0837d3a5, 0x872830f2, 0xa5bf23b2, 0x6a0302ba, 0x8216ed5c,
    0x1ccf8a2b, 0xb479a792, 0xf207f3f0, 0xe2694ea1, 0xf4da65cd, 0xbe0506d5, 0x6234d11f, 0xfea6c48a,
    0x532e349d, 0x55f3a2a0, 0xe18a0532, 0xebf6a475, 0xec830b39, 0xef6040aa, 0x9f715e06, 0x106ebd51,
    0x8a213ef9, 0x06dd963d, 0x053eddae, 0xbde64d46, 0x8d5491b5, 0x5dc47105, 0xd406046f, 0x155060ff,
    0xfb981924, 0xe9bdd697, 0x434089cc, 0x9ed96777, 0x42e8b0bd, 0x8b890788, 0x5b19e738, 0xeec879db,
    0x0a7ca147, 0x0f427ce9, 0x1e84f8c9, 0x00000000, 0x86800983, 0xed2b3248, 0x70111eac, 0x725a6c4e,
    0xff0efdfb, 0x38850f56, 0xd5ae3d1e, 0x392d3627, 0xd90f0a64, 0xa65c6821, 0x545b9bd1, 0x2e36243a,
    0x670a0cb1, 0xe757930f, 0x96eeb4d2, 0x919b1b9e, 0xc5c0804f, 0x20dc61a2, 0x4b775a69, 0x1a121c16,
    0xba93e20a, 0x2aa0c0e5, 0xe0223c43, 0x171b121d, 0x0d090e0b, 0xc78bf2ad, 0xa8b62db9, 0xa91e14c8,
    0x19f15785, 0x0775af4c, 0xdd99eebb, 0x607fa3fd, 0x2601f79f, 0xf5725cbc, 0x3b6644c5, 0x7efb5b34,
    0x29438b76, 0xc623cbdc, 0xfcedb668, 0xf1e4b863, 0xdc31d7ca, 0x85634210, 0x22971340, 0x11c68420,
    0x244a857d, 0x3dbbd2f8, 0x32f9ae11, 0xa129c76d, 0x2f9e1d4b, 0x30b2dcf3, 0x52860dec, 0xe3c177d0,
    0x16b32b6c, 0xb970a999, 0x489411fa, 0x64e94722, 0x8cfca8c4, 0x3ff0a01a, 0x2c7d56d8, 0x903322ef,
    0x4e4987c7, 0xd138d9c1, 0xa2ca8cfe, 0x0bd49836, 0x81f5a6cf, 0xde7aa528, 0x8eb7da26, 0xbfad3fa4,
    0x9d3a2ce4, 0x9278500d, 0xcc5f6a9b, 0x467e5462, 0x138df6c2, 0xb8d890e8, 0xf7392e5e, 0xafc382f5,
    0x805d9fbe, 0x93d0697c, 0x2dd56fa9, 0x1225cfb3, 0x99acc83b, 0x7d1810a7, 0x639ce86e, 0xbb3bdb7b,
    0x7826cd09, 0x18596ef4, 0xb79aec01, 0x9a4f83a8, 0x6e95e665, 0xe6ffaa7e, 0xcfbc2108, 0xe815efe6,
    0x9be7bad9, 0x366f4ace, 0x099fead4, 0x7cb029d6, 0xb2a431af, 0x233f2a31, 0x94a5c630, 0x66a235c0,
    0xbc4e7437, 0xca82fca6, 0xd090e0b0, 0xd8a73315, 0x9804f14a, 0xdaec41f7, 0x50cd7f0e, 0xf691172f,
    0xd64d768d, 0xb0ef434d, 0x4daacc54, 0x0496e4df, 0xb5d19ee3, 0x886a4c1b, 0x1f2cc1b8, 0x5165467f,
    0xea5e9d04, 0x358c015d, 0x7487fa73, 0x410bfb2e, 0x1d67b35a, 0xd2db9252, 0x5610e933, 0x47d66d13,
    0x61d79a8c, 0x0ca1377a, 0x14f8598e, 0x3c13eb89, 0x27a9ceee, 0xc961b735, 0xe51ce1ed, 0xb1477a3c,
    0xdfd29c59, 0x73f2553f, 0xce141879, 0x37c773bf, 0xcdf753ea, 0xaafd5f5b, 0x6f3ddf14, 0xdb447886,
    0xf3afca81, 0xc468b93e, 0x3424382c, 0x40a3c25f, 0xc31d1672, 0x25e2bc0c, 0x493c288b, 0x950dff41,
    0x01a83971, 0xb30c08de, 0xe4b4d89c, 0xc1566490, 0x84cb7b61, 0xb632d570, 0x5c6c4874, 0x57b8d042,
    };

    #define movdqu(A, B) memcpy(&(A), (B), 16)
    #define movdqu_mem(A, B) memcpy((A), &(B), 16)

    #define movq(A, B) do { (A).u64[0] = B; (A).u64[1] = 0; } while (0)
    #define paddq(A, B) do { (A).u64[0] += (B).u64[0]; (A).u64[1] += (B).u64[1]; } while (0)

    #define pxor_clear(A) do { (A).u64[0] = (A).u64[1] = 0; } while (0)
    #define pxor(A, B) do { (A).u64[0] ^= (B).u64[0]; (A).u64[1] ^= (B).u64[1]; } while (0)
    #define pand(A, B) do { (A).u64[0] &= (B).u64[0]; (A).u64[1] &= (B).u64[1]; } while (0)

    #define palignr1(A, B) do { \
    (A).u64[1] = ((B).u64[1] >> 8) | ((A).u64[0] << 56); \
    (A).u64[0] = ((B).u64[0] >> 8) | ((B).u64[1] << 56); \
    } while (0)

    #define palignr15(A, B) do { \
    (A).u64[1] = ((A).u64[0] >> 56) | ((A).u64[1] << 8); \
    (A).u64[0] = ((B).u64[1] >> 56) | ((A).u64[0] << 8); \
    } while (0)

    #define aesdec(A, B) do { \
    meow_u32 s0 = MeowAesBox0[(A).u8[0]] ^ MeowAesBox1[(A).u8[7]] ^ MeowAesBox2[(A).u8[10]] ^ MeowAesBox3[(A).u8[13]]; \
    meow_u32 s1 = MeowAesBox0[(A).u8[4]] ^ MeowAesBox1[(A).u8[11]] ^ MeowAesBox2[(A).u8[14]] ^ MeowAesBox3[(A).u8[1]]; \
    meow_u32 s2 = MeowAesBox0[(A).u8[8]] ^ MeowAesBox1[(A).u8[15]] ^ MeowAesBox2[(A).u8[2]] ^ MeowAesBox3[(A).u8[5]]; \
    meow_u32 s3 = MeowAesBox0[(A).u8[12]] ^ MeowAesBox1[(A).u8[3]] ^ MeowAesBox2[(A).u8[6]] ^ MeowAesBox3[(A).u8[9]]; \
    (A).u32[0] = s0; \
    (A).u32[1] = s1; \
    (A).u32[2] = s2; \
    (A).u32[3] = s3; \
    pxor(A, B); \
    } while (0)

    #define MEOW_MIX_REG(r1, r2, r3, r4, r5, i1, i2, i3, i4) \
    aesdec(r1, r2); \
    paddq(r3, i1); \
    pxor(r2, i2); \
    aesdec(r2, r4); \
    paddq(r5, i3); \
    pxor(r4, i4);

    #define MEOW_MIX(r1, r2, r3, r4, r5, ptr) do { \
    meow_u128 i1, i2, i3, i4; \
    memcpy(&i1, ptr + 15, sizeof(i1)); \
    memcpy(&i2, ptr + 0, sizeof(i2)); \
    memcpy(&i3, ptr + 1, sizeof(i3)); \
    memcpy(&i4, ptr + 16, sizeof(i4)); \
    MEOW_MIX_REG(r1, r2, r3, r4, r5, i1, i2, i3, i4); \
    } while (0)

    #define MEOW_SHUFFLE(r1, r2, r3, r4, r5, r6) do { \
    aesdec(r1, r4); \
    paddq(r2, r5); \
    pxor(r4, r6); \
    aesdec(r4, r2); \
    paddq(r5, r6); \
    pxor(r2, r3); \
    } while (0)

    // NOTE(casey): The default seed is now a "nothing-up-our-sleeves" number for good measure. You may verify that it is just an encoding of Pi.
    static meow_u8 MeowDefaultSeed[128] =
    {
    0x32, 0x43, 0xF6, 0xA8, 0x88, 0x5A, 0x30, 0x8D,
    0x31, 0x31, 0x98, 0xA2, 0xE0, 0x37, 0x07, 0x34,
    0x4A, 0x40, 0x93, 0x82, 0x22, 0x99, 0xF3, 0x1D,
    0x00, 0x82, 0xEF, 0xA9, 0x8E, 0xC4, 0xE6, 0xC8,
    0x94, 0x52, 0x82, 0x1E, 0x63, 0x8D, 0x01, 0x37,
    0x7B, 0xE5, 0x46, 0x6C, 0xF3, 0x4E, 0x90, 0xC6,
    0xCC, 0x0A, 0xC2, 0x9B, 0x7C, 0x97, 0xC5, 0x0D,
    0xD3, 0xF8, 0x4D, 0x5B, 0x5B, 0x54, 0x70, 0x91,
    0x79, 0x21, 0x6D, 0x5D, 0x98, 0x97, 0x9F, 0xB1,
    0xBD, 0x13, 0x10, 0xBA, 0x69, 0x8D, 0xFB, 0x5A,
    0xC2, 0xFF, 0xD7, 0x2D, 0xBD, 0x01, 0xAD, 0xFB,
    0x7B, 0x8E, 0x1A, 0xFE, 0xD6, 0xA2, 0x67, 0xE9,
    0x6B, 0xA7, 0xC9, 0x04, 0x5F, 0x12, 0xC7, 0xF9,
    0x92, 0x4A, 0x19, 0x94, 0x7B, 0x39, 0x16, 0xCF,
    0x70, 0x80, 0x1F, 0x2E, 0x28, 0x58, 0xEF, 0xC1,
    0x66, 0x36, 0x92, 0x0D, 0x87, 0x15, 0x74, 0xE6
    };

    //
    // NOTE(casey): Single block version
    //

    static meow_u128
    MeowHash(void* Seed128Init, meow_umm Len, void* SourceInit)
    {
    meow_u128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; // NOTE(casey): xmm0-xmm7 are the hash accumulation lanes
    meow_u128 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; // NOTE(casey): xmm8-xmm15 hold values to be appended (residual, length)

    meow_u8* rax = (meow_u8*)SourceInit;
    meow_u8* rcx = (meow_u8*)Seed128Init;

    //
    // NOTE(casey): Seed the eight hash registers
    //

    movdqu(xmm0, rcx + 0x00);
    movdqu(xmm1, rcx + 0x10);
    movdqu(xmm2, rcx + 0x20);
    movdqu(xmm3, rcx + 0x30);

    movdqu(xmm4, rcx + 0x40);
    movdqu(xmm5, rcx + 0x50);
    movdqu(xmm6, rcx + 0x60);
    movdqu(xmm7, rcx + 0x70);

    //
    // NOTE(casey): Hash all full 256-byte blocks
    //

    meow_umm BlockCount = (Len >> 8);
    while (BlockCount--)
    {
    MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00);
    MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20);
    MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40);
    MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60);
    MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80);
    MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0);
    MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0);
    MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0);

    rax += 0x100;
    }

    //
    // NOTE(casey): Load any less-than-32-byte residual
    //

    pxor_clear(xmm9);
    pxor_clear(xmm11);

    //
    // TODO(casey): I need to put more thought into how the end-of-buffer stuff is actually working out here,
    // because I _think_ it may be possible to remove the first branch (on Len8) and let the mask zero out the
    // result, but it would take a little thought to make sure it couldn't read off the end of the buffer due
    // to the & 0xf on the align computation.
    //

    // NOTE(casey): First, we have to load the part that is _not_ 16-byte aligned
    meow_u8 *Last = (meow_u8*)SourceInit + (Len & ~0xf);
    meow_u32 Len8 = (Len & 0xf);
    if (Len8)
    {
    for (meow_umm i=0; i<Len8; i++)
    {
    xmm9.u8[i] = Last[i];
    }
    for (meow_umm i=Len8; i<0x10; i++)
    {
    xmm9.u8[i] = 0;
    }
    }

    // NOTE(casey): Next, we have to load the part that _is_ 16-byte aligned
    if (Len & 0x10)
    {
    xmm11 = xmm9;
    movdqu(xmm9, Last - 0x10);
    }

    //
    // NOTE(casey): Construct the residual and length injests
    //

    xmm8 = xmm9;
    xmm10 = xmm9;
    palignr15(xmm8, xmm11);
    palignr1(xmm10, xmm11);

    // NOTE(casey): We have room for a 128-bit nonce and a 64-bit none here, but
    // the decision was made to leave them zero'd so as not to confuse people
    // about hwo to use them or what security implications they had.
    pxor_clear(xmm12);
    pxor_clear(xmm13);
    pxor_clear(xmm14);
    movq(xmm15, Len);
    palignr15(xmm12, xmm15);
    palignr1(xmm14, xmm15);

    // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty
    MEOW_MIX_REG(xmm0, xmm4, xmm6, xmm1, xmm2, xmm8, xmm9, xmm10, xmm11);

    // NOTE(casey): Append the length, to avoid problems with our 32-byte padding
    MEOW_MIX_REG(xmm1, xmm5, xmm7, xmm2, xmm3, xmm12, xmm13, xmm14, xmm15);

    //
    // NOTE(casey): Hash all full 32-byte blocks
    //
    meow_u32 LaneCount = (Len >> 5) & 0x7;
    if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x00); --LaneCount;
    if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x20); --LaneCount;
    if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x40); --LaneCount;
    if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0x60); --LaneCount;
    if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0x80); --LaneCount;
    if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xa0); --LaneCount;
    if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0xc0); --LaneCount;

    //
    // NOTE(casey): Mix the eight lanes down to one 128-bit hash
    //

    MixDown:

    MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6);
    MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7);
    MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0);
    MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1);
    MEOW_SHUFFLE(xmm4, xmm5, xmm6, xmm0, xmm1, xmm2);
    MEOW_SHUFFLE(xmm5, xmm6, xmm7, xmm1, xmm2, xmm3);
    MEOW_SHUFFLE(xmm6, xmm7, xmm0, xmm2, xmm3, xmm4);
    MEOW_SHUFFLE(xmm7, xmm0, xmm1, xmm3, xmm4, xmm5);
    MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6);
    MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7);
    MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0);
    MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1);

    paddq(xmm0, xmm2);
    paddq(xmm1, xmm3);
    paddq(xmm4, xmm6);
    paddq(xmm5, xmm7);
    pxor(xmm0, xmm1);
    pxor(xmm4, xmm5);
    paddq(xmm0, xmm4);

    return xmm0;
    }

    //
    // NOTE(casey): Streaming construction
    //

    typedef struct
    {
    meow_u128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
    meow_u64 TotalLengthInBytes;

    meow_u32 BufferLen;

    meow_u8 Buffer[256];
    meow_u128 Pad[2]; // NOTE(casey): So we know we can over-read Buffer as necessary
    } meow_state;

    static void
    MeowBegin(meow_state* State, void* Seed128)
    {
    meow_u8* rcx = (meow_u8*)Seed128;

    movdqu(State->xmm0, rcx + 0x00);
    movdqu(State->xmm1, rcx + 0x10);
    movdqu(State->xmm2, rcx + 0x20);
    movdqu(State->xmm3, rcx + 0x30);
    movdqu(State->xmm4, rcx + 0x40);
    movdqu(State->xmm5, rcx + 0x50);
    movdqu(State->xmm6, rcx + 0x60);
    movdqu(State->xmm7, rcx + 0x70);

    State->BufferLen = 0;
    State->TotalLengthInBytes = 0;
    }

    static void
    MeowAbsorbBlocks(meow_state* State, meow_umm BlockCount, meow_u8* rax)
    {
    meow_u128 xmm0 = State->xmm0;
    meow_u128 xmm1 = State->xmm1;
    meow_u128 xmm2 = State->xmm2;
    meow_u128 xmm3 = State->xmm3;
    meow_u128 xmm4 = State->xmm4;
    meow_u128 xmm5 = State->xmm5;
    meow_u128 xmm6 = State->xmm6;
    meow_u128 xmm7 = State->xmm7;

    while (BlockCount--)
    {
    MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00);
    MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20);
    MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40);
    MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60);
    MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80);
    MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0);
    MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0);
    MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0);

    rax += 0x100;
    }

    State->xmm0 = xmm0;
    State->xmm1 = xmm1;
    State->xmm2 = xmm2;
    State->xmm3 = xmm3;
    State->xmm4 = xmm4;
    State->xmm5 = xmm5;
    State->xmm6 = xmm6;
    State->xmm7 = xmm7;
    }

    static void
    MeowAbsorb(meow_state* State, meow_umm Len, void* SourceInit)
    {
    State->TotalLengthInBytes += Len;
    meow_u8* Source = (meow_u8*)SourceInit;

    // NOTE(casey): Handle any buffered residual
    if (State->BufferLen)
    {
    meow_u32 Fill = (sizeof(State->Buffer) - State->BufferLen);
    if (Fill > Len)
    {
    Fill = (meow_u32)Len;
    }

    Len -= Fill;
    while (Fill--)
    {
    State->Buffer[State->BufferLen++] = *Source++;
    }

    if (State->BufferLen == sizeof(State->Buffer))
    {
    MeowAbsorbBlocks(State, 1, State->Buffer);
    State->BufferLen = 0;
    }
    }

    // NOTE(casey): Handle any full blocks
    meow_u64 BlockCount = (Len >> 8);
    meow_u64 Advance = (BlockCount << 8);
    MeowAbsorbBlocks(State, BlockCount, Source);

    Len -= Advance;
    Source += Advance;

    // NOTE(casey): Store residual
    while (Len--)
    {
    State->Buffer[State->BufferLen++] = *Source++;
    }
    }

    static meow_u128
    MeowEnd(meow_state* State, meow_u8* Store128)
    {
    meow_u64 Len = State->TotalLengthInBytes;

    meow_u128 xmm0 = State->xmm0;
    meow_u128 xmm1 = State->xmm1;
    meow_u128 xmm2 = State->xmm2;
    meow_u128 xmm3 = State->xmm3;
    meow_u128 xmm4 = State->xmm4;
    meow_u128 xmm5 = State->xmm5;
    meow_u128 xmm6 = State->xmm6;
    meow_u128 xmm7 = State->xmm7;

    meow_u128 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;

    meow_u8* rax = State->Buffer;

    pxor_clear(xmm9);
    pxor_clear(xmm11);

    meow_u8* Last = (uint8_t*)rax + (Len & 0xf0);
    meow_u32 Len8 = (Len & 0xf);
    if (Len8)
    {
    for (meow_umm i=0; i<Len8; i++)
    {
    xmm9.u8[i] = Last[i];
    }
    for (meow_umm i=Len8; i<0x10; i++)
    {
    xmm9.u8[i] = 0;
    }
    }

    if (Len & 0x10)
    {
    xmm11 = xmm9;
    movdqu(xmm9, Last - 0x10);
    }

    xmm8 = xmm9;
    xmm10 = xmm9;
    palignr15(xmm8, xmm11);
    palignr1(xmm10, xmm11);

    pxor_clear(xmm12);
    pxor_clear(xmm13);
    pxor_clear(xmm14);
    movq(xmm15, Len);
    palignr15(xmm12, xmm15);
    palignr1(xmm14, xmm15);

    // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty
    MEOW_MIX_REG(xmm0, xmm4, xmm6, xmm1, xmm2, xmm8, xmm9, xmm10, xmm11);

    // NOTE(casey): Append the length, to avoid problems with our 32-byte padding
    MEOW_MIX_REG(xmm1, xmm5, xmm7, xmm2, xmm3, xmm12, xmm13, xmm14, xmm15);

    //
    // NOTE(casey): Hash all full 32-byte blocks
    //
    meow_u32 LaneCount = (Len >> 5) & 0x7;
    if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x00); --LaneCount;
    if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x20); --LaneCount;
    if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x40); --LaneCount;
    if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0x60); --LaneCount;
    if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0x80); --LaneCount;
    if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xa0); --LaneCount;
    if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0xc0); --LaneCount;

    //
    // NOTE(casey): Mix the eight lanes down to one 128-bit hash
    //

    MixDown:

    MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6);
    MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7);
    MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0);
    MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1);
    MEOW_SHUFFLE(xmm4, xmm5, xmm6, xmm0, xmm1, xmm2);
    MEOW_SHUFFLE(xmm5, xmm6, xmm7, xmm1, xmm2, xmm3);
    MEOW_SHUFFLE(xmm6, xmm7, xmm0, xmm2, xmm3, xmm4);
    MEOW_SHUFFLE(xmm7, xmm0, xmm1, xmm3, xmm4, xmm5);
    MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6);
    MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7);
    MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0);
    MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1);

    if (Store128)
    {
    movdqu_mem(Store128 + 0x00, xmm0);
    movdqu_mem(Store128 + 0x10, xmm1);
    movdqu_mem(Store128 + 0x20, xmm2);
    movdqu_mem(Store128 + 0x30, xmm3);
    movdqu_mem(Store128 + 0x40, xmm4);
    movdqu_mem(Store128 + 0x50, xmm5);
    movdqu_mem(Store128 + 0x60, xmm6);
    movdqu_mem(Store128 + 0x70, xmm7);
    }

    paddq(xmm0, xmm2);
    paddq(xmm1, xmm3);
    paddq(xmm4, xmm6);
    paddq(xmm5, xmm7);
    pxor(xmm0, xmm1);
    pxor(xmm4, xmm5);
    paddq(xmm0, xmm4);

    return(xmm0);
    }

    #undef movdqu
    #undef movdqu_mem
    #undef movq
    #undef aesdec
    #undef pxor
    #undef paddq
    #undef pand
    #undef palignr1
    #undef palignr15
    #undef pxor_clear
    #undef MEOW_MIX
    #undef MEOW_MIX_REG
    #undef MEOW_SHUFFLE

    //
    // NOTE(casey): If you need to create your own seed from non-random data, you can use MeowExpandSeed
    // to create a seed which you then store for repeated use. It is _expensive_ to generate the seed,
    // so you do not want to do this every time you hash. You _only_ want to do it when you actually
    // need to create a new seed.
    //

    static void
    MeowExpandSeed(meow_umm InputLen, void* Input, meow_u8* SeedResult)
    {
    meow_state State;
    meow_u64 LengthTab = (meow_u64)InputLen; // NOTE(casey): We need to always injest 8-byte lengths exactly, even on 32-bit builds, to ensure identical results
    meow_umm InjestCount = (256 / InputLen) + 2;

    MeowBegin(&State, MeowDefaultSeed);
    MeowAbsorb(&State, sizeof(LengthTab), &LengthTab);
    while (InjestCount--)
    {
    MeowAbsorb(&State, InputLen, Input);
    }
    MeowEnd(&State, SeedResult);
    }