Skip to content

Instantly share code, notes, and snippets.

@rendello
Last active December 25, 2025 23:54
Show Gist options
  • Select an option

  • Save rendello/d37552507a389656e248f3255a618127 to your computer and use it in GitHub Desktop.

Select an option

Save rendello/d37552507a389656e248f3255a618127 to your computer and use it in GitHub Desktop.

Revisions

  1. rendello renamed this gist Nov 1, 2024. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  2. rendello revised this gist Nov 1, 2024. 2 changed files with 18 additions and 4 deletions.
    15 changes: 15 additions & 0 deletions generate_utf8.py
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,18 @@
    """
    Copyright (c) 2024 Rendello
    Permission to use, copy, modify, and/or distribute this software for any
    purpose with or without fee is hereby granted.
    THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
    REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
    AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
    INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
    LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
    OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
    PERFORMANCE OF THIS SOFTWARE.
    """

    import sys
    from dataclasses import dataclass
    from typing import List, Dict
    7 changes: 3 additions & 4 deletions utf8_case_data.rs
    Original file line number Diff line number Diff line change
    @@ -13,10 +13,9 @@ OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
    PERFORMANCE OF THIS SOFTWARE.
    */

    // =======================================================================
    //! Unicode characters that behave oddly when the case is changed, for use
    //! with property tests.
    // =======================================================================
    // ==========================================================================
    //! Unicode codepoints that expand or contract when case is changed in UTF-8.
    // ==========================================================================

    pub const LOWERCASING_CONTRACTS: [&str; 22] = [
    "ẞ", /* ß (3->2), -1 bytes */
  3. rendello revised this gist Nov 1, 2024. No changes.
  4. rendello revised this gist Nov 1, 2024. 1 changed file with 89 additions and 0 deletions.
    89 changes: 89 additions & 0 deletions generate_utf8.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,89 @@
    import sys
    from dataclasses import dataclass
    from typing import List, Dict

    @dataclass
    class Entry:
        """A single case mapping whose UTF-8 byte length differs from its source.

        ``a`` is the source string and ``b`` is the result of upper- or
        lower-casing it. The remaining fields cache the sizes of both strings
        so sorting and rendering do not have to recompute them.
        """

        a: str  # source string before the case change
        b: str  # string produced by the case change
        a_len: int  # UTF-8 encoded length of ``a`` in bytes
        b_len: int  # UTF-8 encoded length of ``b`` in bytes
        delta: int  # b_len - a_len
        a_char_count: int  # len(a), i.e. number of code points in ``a``
        b_char_count: int  # len(b), i.e. number of code points in ``b``
        delta_char_count: int  # b_char_count - a_char_count


    def sort_entries(l: List[Entry]) -> List[Entry]:
        """Return a new list holding the entries of ``l`` in output order.

        Entries are ordered by descending character-count delta, then by
        descending byte delta, and finally by the source string ``a`` in
        ascending order. The input list is left unmodified.
        """
        return sorted(l, key=lambda e: (-e.delta_char_count, -e.delta, e.a))


    def create_entry_map() -> Dict[str, List[Entry]]:
        """Collect every code point whose case mapping changes its UTF-8 length.

        Each code point from 0 through ``sys.maxunicode`` is upper- and
        lower-cased. A mapping is recorded only when the UTF-8 byte length of
        the result differs from that of the source.

        Returns:
            A dict keyed by a category name built from the direction
            (``uppercasing`` / ``lowercasing``), the size change (``expands`` /
            ``contracts``) and, when the result has more than one code point,
            ``multi_char`` -- joined with underscores, e.g.
            ``uppercasing_expands_multi_char``. Values are the matching
            entries in code point order.
        """
        entry_map: Dict[str, List[Entry]] = {}
        for i in range(sys.maxunicode + 1):
            a = chr(i)

            # The source length does not depend on the case direction, so
            # encode it once per code point. Code points that cannot be
            # encoded as UTF-8 (e.g. lone surrogates) are skipped entirely.
            try:
                a_len = len(a.encode("utf8"))
            except UnicodeEncodeError:
                continue
            a_char_count = len(a)

            for case, b in (("uppercasing", a.upper()), ("lowercasing", a.lower())):
                try:
                    b_len = len(b.encode("utf8"))
                except UnicodeEncodeError:
                    continue

                # Only mappings that change the encoded size are of interest.
                if a_len == b_len:
                    continue

                attributes = [case, "expands" if a_len < b_len else "contracts"]

                b_char_count = len(b)
                if b_char_count > 1:
                    attributes.append("multi_char")

                entry = Entry(
                    a=a,
                    b=b,
                    a_len=a_len,
                    b_len=b_len,
                    delta=b_len - a_len,
                    a_char_count=a_char_count,
                    b_char_count=b_char_count,
                    delta_char_count=b_char_count - a_char_count,
                )
                entry_map.setdefault("_".join(attributes), []).append(entry)

        return entry_map


    def entry_map_to_string(entry_map: Dict[str, List[Entry]]) -> str:
        """Render ``entry_map`` as Rust source text.

        The output starts with a fixed comment header, followed by one
        ``pub const NAME: [&str; N]`` array per map key (key upper-cased,
        keys in sorted order). Each array element is the source string ``a``
        with a trailing comment showing the mapped string, the byte lengths,
        the byte delta and -- when non-zero -- the character-count delta.

        Args:
            entry_map: Category name mapped to its entries.

        Returns:
            The generated text with leading and trailing whitespace stripped.
        """
        # Pieces are collected in a list and joined once at the end instead of
        # repeatedly concatenating onto a growing string.
        chunks = [
            "// =======================================================================\n"
            "//! Automatically generated using `task generate-utf8-case-data`.\n//!\n"
            "//! Unicode characters that behave oddly when the case is changed, for use\n"
            "//! with property tests.\n"
            "// =======================================================================\n\n"
        ]

        # Dict keys are unique, so sorting the (key, entries) pairs never has
        # to compare the entry lists themselves.
        for key, unsorted_entries in sorted(entry_map.items()):
            entries = sort_entries(unsorted_entries)

            chunks.append(f'pub const {key.upper()}: [&str; {len(entries)}] = [\n')
            for e in entries:
                # Mention the character delta only when the count changes.
                ds = f", {e.delta_char_count:+} chars" if e.delta_char_count != 0 else ""
                chunks.append(
                    f' "{e.a}",\t/* {e.b}\t({e.a_len}->{e.b_len}), {e.delta:+} bytes{ds} */\n'
                )
            chunks.append("];\n\n")

        return "".join(chunks).strip()


    def generate_utf8_case_data() -> str:
        """Build the case-mapping entry map and return it rendered as Rust source text."""
        entry_map = create_entry_map()
        return entry_map_to_string(entry_map)
  5. rendello revised this gist Nov 1, 2024. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion utf8_case_data.rs
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,5 @@
    /*
    Copyright (c) 2024 Gaven Rendell
    Copyright (c) 2024 Rendello
    Permission to use, copy, modify, and/or distribute this software for any
    purpose with or without fee is hereby granted.
  6. rendello revised this gist Sep 19, 2024. 1 changed file with 0 additions and 2 deletions.
    2 changes: 0 additions & 2 deletions utf8_case_data.rs
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,4 @@
    /*
    BSD Zero Clause License
    Copyright (c) 2024 Gaven Rendell
    Permission to use, copy, modify, and/or distribute this software for any
  7. rendello created this gist Sep 16, 2024.
    192 changes: 192 additions & 0 deletions utf8_case_data.rs
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,192 @@
    /*
    BSD Zero Clause License
    Copyright (c) 2024 Gaven Rendell
    Permission to use, copy, modify, and/or distribute this software for any
    purpose with or without fee is hereby granted.
    THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
    REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
    AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
    INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
    LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
    OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
    PERFORMANCE OF THIS SOFTWARE.
    */

    // =======================================================================
    //! Unicode characters that behave oddly when the case is changed, for use
    //! with property tests.
    // =======================================================================

    pub const LOWERCASING_CONTRACTS: [&str; 22] = [
    "ẞ", /* ß (3->2), -1 bytes */
    "Ω", /* ω (3->2), -1 bytes */
    "Å", /* å (3->2), -1 bytes */
    "Ɫ", /* ɫ (3->2), -1 bytes */
    "Ɽ", /* ɽ (3->2), -1 bytes */
    "Ɑ", /* ɑ (3->2), -1 bytes */
    "Ɱ", /* ɱ (3->2), -1 bytes */
    "Ɐ", /* ɐ (3->2), -1 bytes */
    "Ɒ", /* ɒ (3->2), -1 bytes */
    "Ȿ", /* ȿ (3->2), -1 bytes */
    "Ɀ", /* ɀ (3->2), -1 bytes */
    "Ɥ", /* ɥ (3->2), -1 bytes */
    "Ɦ", /* ɦ (3->2), -1 bytes */
    "Ɜ", /* ɜ (3->2), -1 bytes */
    "Ɡ", /* ɡ (3->2), -1 bytes */
    "Ɬ", /* ɬ (3->2), -1 bytes */
    "Ɪ", /* ɪ (3->2), -1 bytes */
    "Ʞ", /* ʞ (3->2), -1 bytes */
    "Ʇ", /* ʇ (3->2), -1 bytes */
    "Ʝ", /* ʝ (3->2), -1 bytes */
    "Ʂ", /* ʂ (3->2), -1 bytes */
    "K", /* k (3->1), -2 bytes */
    ];

    pub const LOWERCASING_EXPANDS: [&str; 2] = [
    "Ⱥ", /* ⱥ (2->3), +1 bytes */
    "Ⱦ", /* ⱦ (2->3), +1 bytes */
    ];

    pub const LOWERCASING_EXPANDS_MULTI_CHAR: [&str; 1] = [
    "İ", /* i̇ (2->3), +1 bytes, +1 chars */
    ];

    pub const UPPERCASING_CONTRACTS: [&str; 13] = [
    "ı", /* I (2->1), -1 bytes */
    "ſ", /* S (2->1), -1 bytes */
    "ᲀ", /* В (3->2), -1 bytes */
    "ᲁ", /* Д (3->2), -1 bytes */
    "ᲂ", /* О (3->2), -1 bytes */
    "ᲃ", /* С (3->2), -1 bytes */
    "ᲄ", /* Т (3->2), -1 bytes */
    "ᲅ", /* Т (3->2), -1 bytes */
    "ᲆ", /* Ъ (3->2), -1 bytes */
    "ᲇ", /* Ѣ (3->2), -1 bytes */
    "ι", /* Ι (3->2), -1 bytes */
    "ⱥ", /* Ⱥ (3->2), -1 bytes */
    "ⱦ", /* Ⱦ (3->2), -1 bytes */
    ];

    pub const UPPERCASING_CONTRACTS_MULTI_CHAR: [&str; 5] = [
    "ff", /* FF (3->2), -1 bytes, +1 chars */
    "fi", /* FI (3->2), -1 bytes, +1 chars */
    "fl", /* FL (3->2), -1 bytes, +1 chars */
    "ſt", /* ST (3->2), -1 bytes, +1 chars */
    "st", /* ST (3->2), -1 bytes, +1 chars */
    ];

    pub const UPPERCASING_EXPANDS: [&str; 18] = [
    "ȿ", /* Ȿ (2->3), +1 bytes */
    "ɀ", /* Ɀ (2->3), +1 bytes */
    "ɐ", /* Ɐ (2->3), +1 bytes */
    "ɑ", /* Ɑ (2->3), +1 bytes */
    "ɒ", /* Ɒ (2->3), +1 bytes */
    "ɜ", /* Ɜ (2->3), +1 bytes */
    "ɡ", /* Ɡ (2->3), +1 bytes */
    "ɥ", /* Ɥ (2->3), +1 bytes */
    "ɦ", /* Ɦ (2->3), +1 bytes */
    "ɪ", /* Ɪ (2->3), +1 bytes */
    "ɫ", /* Ɫ (2->3), +1 bytes */
    "ɬ", /* Ɬ (2->3), +1 bytes */
    "ɱ", /* Ɱ (2->3), +1 bytes */
    "ɽ", /* Ɽ (2->3), +1 bytes */
    "ʂ", /* Ʂ (2->3), +1 bytes */
    "ʇ", /* Ʇ (2->3), +1 bytes */
    "ʝ", /* Ʝ (2->3), +1 bytes */
    "ʞ", /* Ʞ (2->3), +1 bytes */
    ];

    pub const UPPERCASING_EXPANDS_MULTI_CHAR: [&str; 89] = [
    "ΐ", /* Ϊ́ (2->6), +4 bytes, +2 chars */
    "ΰ", /* Ϋ́ (2->6), +4 bytes, +2 chars */
    "ὒ", /* Υ̓̀ (3->6), +3 bytes, +2 chars */
    "ὔ", /* Υ̓́ (3->6), +3 bytes, +2 chars */
    "ὖ", /* Υ̓͂ (3->6), +3 bytes, +2 chars */
    "ᾷ", /* Α͂Ι (3->6), +3 bytes, +2 chars */
    "ῇ", /* Η͂Ι (3->6), +3 bytes, +2 chars */
    "ῒ", /* Ϊ̀ (3->6), +3 bytes, +2 chars */
    "ΐ", /* Ϊ́ (3->6), +3 bytes, +2 chars */
    "ῗ", /* Ϊ͂ (3->6), +3 bytes, +2 chars */
    "ῢ", /* Ϋ̀ (3->6), +3 bytes, +2 chars */
    "ΰ", /* Ϋ́ (3->6), +3 bytes, +2 chars */
    "ῧ", /* Ϋ͂ (3->6), +3 bytes, +2 chars */
    "ῷ", /* Ω͂Ι (3->6), +3 bytes, +2 chars */
    "և", /* ԵՒ (2->4), +2 bytes, +1 chars */
    "ᾀ", /* ἈΙ (3->5), +2 bytes, +1 chars */
    "ᾁ", /* ἉΙ (3->5), +2 bytes, +1 chars */
    "ᾂ", /* ἊΙ (3->5), +2 bytes, +1 chars */
    "ᾃ", /* ἋΙ (3->5), +2 bytes, +1 chars */
    "ᾄ", /* ἌΙ (3->5), +2 bytes, +1 chars */
    "ᾅ", /* ἍΙ (3->5), +2 bytes, +1 chars */
    "ᾆ", /* ἎΙ (3->5), +2 bytes, +1 chars */
    "ᾇ", /* ἏΙ (3->5), +2 bytes, +1 chars */
    "ᾈ", /* ἈΙ (3->5), +2 bytes, +1 chars */
    "ᾉ", /* ἉΙ (3->5), +2 bytes, +1 chars */
    "ᾊ", /* ἊΙ (3->5), +2 bytes, +1 chars */
    "ᾋ", /* ἋΙ (3->5), +2 bytes, +1 chars */
    "ᾌ", /* ἌΙ (3->5), +2 bytes, +1 chars */
    "ᾍ", /* ἍΙ (3->5), +2 bytes, +1 chars */
    "ᾎ", /* ἎΙ (3->5), +2 bytes, +1 chars */
    "ᾏ", /* ἏΙ (3->5), +2 bytes, +1 chars */
    "ᾐ", /* ἨΙ (3->5), +2 bytes, +1 chars */
    "ᾑ", /* ἩΙ (3->5), +2 bytes, +1 chars */
    "ᾒ", /* ἪΙ (3->5), +2 bytes, +1 chars */
    "ᾓ", /* ἫΙ (3->5), +2 bytes, +1 chars */
    "ᾔ", /* ἬΙ (3->5), +2 bytes, +1 chars */
    "ᾕ", /* ἭΙ (3->5), +2 bytes, +1 chars */
    "ᾖ", /* ἮΙ (3->5), +2 bytes, +1 chars */
    "ᾗ", /* ἯΙ (3->5), +2 bytes, +1 chars */
    "ᾘ", /* ἨΙ (3->5), +2 bytes, +1 chars */
    "ᾙ", /* ἩΙ (3->5), +2 bytes, +1 chars */
    "ᾚ", /* ἪΙ (3->5), +2 bytes, +1 chars */
    "ᾛ", /* ἫΙ (3->5), +2 bytes, +1 chars */
    "ᾜ", /* ἬΙ (3->5), +2 bytes, +1 chars */
    "ᾝ", /* ἭΙ (3->5), +2 bytes, +1 chars */
    "ᾞ", /* ἮΙ (3->5), +2 bytes, +1 chars */
    "ᾟ", /* ἯΙ (3->5), +2 bytes, +1 chars */
    "ᾠ", /* ὨΙ (3->5), +2 bytes, +1 chars */
    "ᾡ", /* ὩΙ (3->5), +2 bytes, +1 chars */
    "ᾢ", /* ὪΙ (3->5), +2 bytes, +1 chars */
    "ᾣ", /* ὫΙ (3->5), +2 bytes, +1 chars */
    "ᾤ", /* ὬΙ (3->5), +2 bytes, +1 chars */
    "ᾥ", /* ὭΙ (3->5), +2 bytes, +1 chars */
    "ᾦ", /* ὮΙ (3->5), +2 bytes, +1 chars */
    "ᾧ", /* ὯΙ (3->5), +2 bytes, +1 chars */
    "ᾨ", /* ὨΙ (3->5), +2 bytes, +1 chars */
    "ᾩ", /* ὩΙ (3->5), +2 bytes, +1 chars */
    "ᾪ", /* ὪΙ (3->5), +2 bytes, +1 chars */
    "ᾫ", /* ὫΙ (3->5), +2 bytes, +1 chars */
    "ᾬ", /* ὬΙ (3->5), +2 bytes, +1 chars */
    "ᾭ", /* ὭΙ (3->5), +2 bytes, +1 chars */
    "ᾮ", /* ὮΙ (3->5), +2 bytes, +1 chars */
    "ᾯ", /* ὯΙ (3->5), +2 bytes, +1 chars */
    "ᾲ", /* ᾺΙ (3->5), +2 bytes, +1 chars */
    "ῂ", /* ῊΙ (3->5), +2 bytes, +1 chars */
    "ῲ", /* ῺΙ (3->5), +2 bytes, +1 chars */
    "ʼn", /* ʼN (2->3), +1 bytes, +1 chars */
    "ǰ", /* J̌ (2->3), +1 bytes, +1 chars */
    "ὐ", /* Υ̓ (3->4), +1 bytes, +1 chars */
    "ᾳ", /* ΑΙ (3->4), +1 bytes, +1 chars */
    "ᾴ", /* ΆΙ (3->4), +1 bytes, +1 chars */
    "ᾶ", /* Α͂ (3->4), +1 bytes, +1 chars */
    "ᾼ", /* ΑΙ (3->4), +1 bytes, +1 chars */
    "ῃ", /* ΗΙ (3->4), +1 bytes, +1 chars */
    "ῄ", /* ΉΙ (3->4), +1 bytes, +1 chars */
    "ῆ", /* Η͂ (3->4), +1 bytes, +1 chars */
    "ῌ", /* ΗΙ (3->4), +1 bytes, +1 chars */
    "ῖ", /* Ι͂ (3->4), +1 bytes, +1 chars */
    "ῤ", /* Ρ̓ (3->4), +1 bytes, +1 chars */
    "ῦ", /* Υ͂ (3->4), +1 bytes, +1 chars */
    "ῳ", /* ΩΙ (3->4), +1 bytes, +1 chars */
    "ῴ", /* ΏΙ (3->4), +1 bytes, +1 chars */
    "ῶ", /* Ω͂ (3->4), +1 bytes, +1 chars */
    "ῼ", /* ΩΙ (3->4), +1 bytes, +1 chars */
    "ﬓ", /* ՄՆ (3->4), +1 bytes, +1 chars */
    "ﬔ", /* ՄԵ (3->4), +1 bytes, +1 chars */
    "ﬕ", /* ՄԻ (3->4), +1 bytes, +1 chars */
    "ﬖ", /* ՎՆ (3->4), +1 bytes, +1 chars */
    "ﬗ", /* ՄԽ (3->4), +1 bytes, +1 chars */
    ];