Last active
December 25, 2025 23:54
-
-
Save rendello/d37552507a389656e248f3255a618127 to your computer and use it in GitHub Desktop.
Revisions
-
rendello renamed this gist
Nov 1, 2024 . 1 changed file with 0 additions and 0 deletions.There are no files selected for viewing
File renamed without changes. -
rendello revised this gist
Nov 1, 2024 . 2 changed files with 18 additions and 4 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,3 +1,18 @@ """ Copyright (c) 2024 Rendello Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted. THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. """ import sys from dataclasses import dataclass from typing import List, Dict This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -13,10 +13,9 @@ OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ // ========================================================================== //! Unicode codepoints that expand or contract when case is changed in UTF-8. // ========================================================================== pub const LOWERCASING_CONTRACTS: [&str; 22] = [ "ẞ", /* ß (3->2), -1 bytes */ -
rendello revised this gist
Nov 1, 2024 . No changes.There are no files selected for viewing
-
rendello revised this gist
Nov 1, 2024 . 1 changed file with 89 additions and 0 deletions.There are no files selected for viewing
import sys
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional

# Banner placed at the top of the generated Rust source.
_RULE = "// " + "=" * 71
_HEADER = (
    f"{_RULE}\n"
    "//! Automatically generated using `task generate-utf8-case-data`.\n//!\n"
    "//! Unicode characters that behave oddly when the case is changed, for use\n"
    "//! with property tests.\n"
    f"{_RULE}\n\n"
)


@dataclass
class Entry:
    """A character whose UTF-8 byte length changes under a case mapping."""

    a: str  # the source character (a single code point)
    b: str  # `a` after upper- or lowercasing; may be several code points
    a_len: int  # UTF-8 byte length of `a`
    b_len: int  # UTF-8 byte length of `b`
    delta: int  # b_len - a_len
    a_char_count: int  # len(a), in code points
    b_char_count: int  # len(b), in code points
    delta_char_count: int  # b_char_count - a_char_count


def sort_entries(l: List[Entry]) -> List[Entry]:
    """Return a new list of entries in output order.

    Entries are ordered by code-point-count growth (largest first), then by
    byte growth (largest first), then by the source character itself.
    The input list is not modified.
    """
    return sorted(l, key=lambda e: (-e.delta_char_count, -e.delta, e.a))


def create_entry_map(
    code_points: Optional[Iterable[int]] = None,
) -> Dict[str, List[Entry]]:
    """Group characters whose UTF-8 length changes when their case changes.

    Args:
        code_points: Code points to examine. Defaults to every code point
            from 0 through ``sys.maxunicode``. Values outside that range
            make ``chr`` raise ``ValueError``.

    Returns:
        A dict keyed by ``"<uppercasing|lowercasing>_<expands|contracts>"``
        with an optional ``"_multi_char"`` suffix when the mapped string has
        more than one code point. Each value lists the matching entries in
        the order they were encountered.
    """
    if code_points is None:
        code_points = range(sys.maxunicode + 1)

    entry_map: Dict[str, List[Entry]] = {}
    for code_point in code_points:
        a = chr(code_point)
        try:
            a_len = len(a.encode("utf8"))
        except UnicodeEncodeError:
            # Lone surrogates have no UTF-8 encoding; nothing to report.
            continue
        a_char_count = len(a)

        for case, b in (("uppercasing", a.upper()), ("lowercasing", a.lower())):
            b_len = len(b.encode("utf8"))
            if a_len == b_len:
                continue

            b_char_count = len(b)
            attributes = [case, "expands" if b_len > a_len else "contracts"]
            if b_char_count > 1:
                attributes.append("multi_char")

            entry = Entry(
                a,
                b,
                a_len,
                b_len,
                b_len - a_len,
                a_char_count,
                b_char_count,
                b_char_count - a_char_count,
            )
            entry_map.setdefault("_".join(attributes), []).append(entry)

    return entry_map


def entry_map_to_string(entry_map: Dict[str, List[Entry]]) -> str:
    """Render an entry map as Rust source declaring one ``&str`` array per key.

    Keys are emitted in sorted order, upper-cased to form the constant name;
    entries inside each array are ordered by ``sort_entries``. Each element
    is followed by a comment giving the mapped string, the byte lengths, and
    the byte (and, when non-zero, code point) delta. Surrounding whitespace
    is stripped from the result.
    """
    # Collect fragments and join once instead of repeated `+=` on a str.
    parts = [_HEADER]
    for key in sorted(entry_map):
        entries = sort_entries(entry_map[key])
        parts.append(f"pub const {key.upper()}: [&str; {len(entries)}] = [\n")
        for e in entries:
            chars_note = (
                f", {e.delta_char_count:+} chars" if e.delta_char_count != 0 else ""
            )
            parts.append(
                f'    "{e.a}",\t/* {e.b}\t({e.a_len}->{e.b_len}), '
                f"{e.delta:+} bytes{chars_note} */\n"
            )
        parts.append("];\n\n")

    return "".join(parts).strip()


def generate_utf8_case_data() -> str:
    """Scan every Unicode code point and return the generated Rust source."""
    return entry_map_to_string(create_entry_map())
rendello revised this gist
Nov 1, 2024 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,5 +1,5 @@ /* Copyright (c) 2024 Rendello Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted. -
rendello revised this gist
Sep 19, 2024 . 1 changed file with 0 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,6 +1,4 @@ /* Copyright (c) 2024 Gaven Rendell Permission to use, copy, modify, and/or distribute this software for any -
rendello created this gist
Sep 16, 2024 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,192 @@ /* BSD Zero Clause License Copyright (c) 2024 Gaven Rendell Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted. THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ // ======================================================================= //! Unicode characters that behave oddly when the case is changed, for use //! with property tests. 
// ======================================================================= pub const LOWERCASING_CONTRACTS: [&str; 22] = [ "ẞ", /* ß (3->2), -1 bytes */ "Ω", /* ω (3->2), -1 bytes */ "Å", /* å (3->2), -1 bytes */ "Ɫ", /* ɫ (3->2), -1 bytes */ "Ɽ", /* ɽ (3->2), -1 bytes */ "Ɑ", /* ɑ (3->2), -1 bytes */ "Ɱ", /* ɱ (3->2), -1 bytes */ "Ɐ", /* ɐ (3->2), -1 bytes */ "Ɒ", /* ɒ (3->2), -1 bytes */ "Ȿ", /* ȿ (3->2), -1 bytes */ "Ɀ", /* ɀ (3->2), -1 bytes */ "Ɥ", /* ɥ (3->2), -1 bytes */ "Ɦ", /* ɦ (3->2), -1 bytes */ "Ɜ", /* ɜ (3->2), -1 bytes */ "Ɡ", /* ɡ (3->2), -1 bytes */ "Ɬ", /* ɬ (3->2), -1 bytes */ "Ɪ", /* ɪ (3->2), -1 bytes */ "Ʞ", /* ʞ (3->2), -1 bytes */ "Ʇ", /* ʇ (3->2), -1 bytes */ "Ʝ", /* ʝ (3->2), -1 bytes */ "Ʂ", /* ʂ (3->2), -1 bytes */ "K", /* k (3->1), -2 bytes */ ]; pub const LOWERCASING_EXPANDS: [&str; 2] = [ "Ⱥ", /* ⱥ (2->3), +1 bytes */ "Ⱦ", /* ⱦ (2->3), +1 bytes */ ]; pub const LOWERCASING_EXPANDS_MULTI_CHAR: [&str; 1] = [ "İ", /* i̇ (2->3), +1 bytes, +1 chars */ ]; pub const UPPERCASING_CONTRACTS: [&str; 13] = [ "ı", /* I (2->1), -1 bytes */ "ſ", /* S (2->1), -1 bytes */ "ᲀ", /* В (3->2), -1 bytes */ "ᲁ", /* Д (3->2), -1 bytes */ "ᲂ", /* О (3->2), -1 bytes */ "ᲃ", /* С (3->2), -1 bytes */ "ᲄ", /* Т (3->2), -1 bytes */ "ᲅ", /* Т (3->2), -1 bytes */ "ᲆ", /* Ъ (3->2), -1 bytes */ "ᲇ", /* Ѣ (3->2), -1 bytes */ "ι", /* Ι (3->2), -1 bytes */ "ⱥ", /* Ⱥ (3->2), -1 bytes */ "ⱦ", /* Ⱦ (3->2), -1 bytes */ ]; pub const UPPERCASING_CONTRACTS_MULTI_CHAR: [&str; 5] = [ "ff", /* FF (3->2), -1 bytes, +1 chars */ "fi", /* FI (3->2), -1 bytes, +1 chars */ "fl", /* FL (3->2), -1 bytes, +1 chars */ "ſt", /* ST (3->2), -1 bytes, +1 chars */ "st", /* ST (3->2), -1 bytes, +1 chars */ ]; pub const UPPERCASING_EXPANDS: [&str; 18] = [ "ȿ", /* Ȿ (2->3), +1 bytes */ "ɀ", /* Ɀ (2->3), +1 bytes */ "ɐ", /* Ɐ (2->3), +1 bytes */ "ɑ", /* Ɑ (2->3), +1 bytes */ "ɒ", /* Ɒ (2->3), +1 bytes */ "ɜ", /* Ɜ (2->3), +1 bytes */ "ɡ", /* Ɡ (2->3), +1 bytes */ "ɥ", /* Ɥ (2->3), +1 bytes */ "ɦ", /* 
Ɦ (2->3), +1 bytes */ "ɪ", /* Ɪ (2->3), +1 bytes */ "ɫ", /* Ɫ (2->3), +1 bytes */ "ɬ", /* Ɬ (2->3), +1 bytes */ "ɱ", /* Ɱ (2->3), +1 bytes */ "ɽ", /* Ɽ (2->3), +1 bytes */ "ʂ", /* Ʂ (2->3), +1 bytes */ "ʇ", /* Ʇ (2->3), +1 bytes */ "ʝ", /* Ʝ (2->3), +1 bytes */ "ʞ", /* Ʞ (2->3), +1 bytes */ ]; pub const UPPERCASING_EXPANDS_MULTI_CHAR: [&str; 89] = [ "ΐ", /* Ϊ́ (2->6), +4 bytes, +2 chars */ "ΰ", /* Ϋ́ (2->6), +4 bytes, +2 chars */ "ὒ", /* Υ̓̀ (3->6), +3 bytes, +2 chars */ "ὔ", /* Υ̓́ (3->6), +3 bytes, +2 chars */ "ὖ", /* Υ̓͂ (3->6), +3 bytes, +2 chars */ "ᾷ", /* Α͂Ι (3->6), +3 bytes, +2 chars */ "ῇ", /* Η͂Ι (3->6), +3 bytes, +2 chars */ "ῒ", /* Ϊ̀ (3->6), +3 bytes, +2 chars */ "ΐ", /* Ϊ́ (3->6), +3 bytes, +2 chars */ "ῗ", /* Ϊ͂ (3->6), +3 bytes, +2 chars */ "ῢ", /* Ϋ̀ (3->6), +3 bytes, +2 chars */ "ΰ", /* Ϋ́ (3->6), +3 bytes, +2 chars */ "ῧ", /* Ϋ͂ (3->6), +3 bytes, +2 chars */ "ῷ", /* Ω͂Ι (3->6), +3 bytes, +2 chars */ "և", /* ԵՒ (2->4), +2 bytes, +1 chars */ "ᾀ", /* ἈΙ (3->5), +2 bytes, +1 chars */ "ᾁ", /* ἉΙ (3->5), +2 bytes, +1 chars */ "ᾂ", /* ἊΙ (3->5), +2 bytes, +1 chars */ "ᾃ", /* ἋΙ (3->5), +2 bytes, +1 chars */ "ᾄ", /* ἌΙ (3->5), +2 bytes, +1 chars */ "ᾅ", /* ἍΙ (3->5), +2 bytes, +1 chars */ "ᾆ", /* ἎΙ (3->5), +2 bytes, +1 chars */ "ᾇ", /* ἏΙ (3->5), +2 bytes, +1 chars */ "ᾈ", /* ἈΙ (3->5), +2 bytes, +1 chars */ "ᾉ", /* ἉΙ (3->5), +2 bytes, +1 chars */ "ᾊ", /* ἊΙ (3->5), +2 bytes, +1 chars */ "ᾋ", /* ἋΙ (3->5), +2 bytes, +1 chars */ "ᾌ", /* ἌΙ (3->5), +2 bytes, +1 chars */ "ᾍ", /* ἍΙ (3->5), +2 bytes, +1 chars */ "ᾎ", /* ἎΙ (3->5), +2 bytes, +1 chars */ "ᾏ", /* ἏΙ (3->5), +2 bytes, +1 chars */ "ᾐ", /* ἨΙ (3->5), +2 bytes, +1 chars */ "ᾑ", /* ἩΙ (3->5), +2 bytes, +1 chars */ "ᾒ", /* ἪΙ (3->5), +2 bytes, +1 chars */ "ᾓ", /* ἫΙ (3->5), +2 bytes, +1 chars */ "ᾔ", /* ἬΙ (3->5), +2 bytes, +1 chars */ "ᾕ", /* ἭΙ (3->5), +2 bytes, +1 chars */ "ᾖ", /* ἮΙ (3->5), +2 bytes, +1 chars */ "ᾗ", /* ἯΙ (3->5), +2 bytes, +1 chars */ "ᾘ", /* ἨΙ (3->5), +2 bytes, +1 chars */ 
"ᾙ", /* ἩΙ (3->5), +2 bytes, +1 chars */ "ᾚ", /* ἪΙ (3->5), +2 bytes, +1 chars */ "ᾛ", /* ἫΙ (3->5), +2 bytes, +1 chars */ "ᾜ", /* ἬΙ (3->5), +2 bytes, +1 chars */ "ᾝ", /* ἭΙ (3->5), +2 bytes, +1 chars */ "ᾞ", /* ἮΙ (3->5), +2 bytes, +1 chars */ "ᾟ", /* ἯΙ (3->5), +2 bytes, +1 chars */ "ᾠ", /* ὨΙ (3->5), +2 bytes, +1 chars */ "ᾡ", /* ὩΙ (3->5), +2 bytes, +1 chars */ "ᾢ", /* ὪΙ (3->5), +2 bytes, +1 chars */ "ᾣ", /* ὫΙ (3->5), +2 bytes, +1 chars */ "ᾤ", /* ὬΙ (3->5), +2 bytes, +1 chars */ "ᾥ", /* ὭΙ (3->5), +2 bytes, +1 chars */ "ᾦ", /* ὮΙ (3->5), +2 bytes, +1 chars */ "ᾧ", /* ὯΙ (3->5), +2 bytes, +1 chars */ "ᾨ", /* ὨΙ (3->5), +2 bytes, +1 chars */ "ᾩ", /* ὩΙ (3->5), +2 bytes, +1 chars */ "ᾪ", /* ὪΙ (3->5), +2 bytes, +1 chars */ "ᾫ", /* ὫΙ (3->5), +2 bytes, +1 chars */ "ᾬ", /* ὬΙ (3->5), +2 bytes, +1 chars */ "ᾭ", /* ὭΙ (3->5), +2 bytes, +1 chars */ "ᾮ", /* ὮΙ (3->5), +2 bytes, +1 chars */ "ᾯ", /* ὯΙ (3->5), +2 bytes, +1 chars */ "ᾲ", /* ᾺΙ (3->5), +2 bytes, +1 chars */ "ῂ", /* ῊΙ (3->5), +2 bytes, +1 chars */ "ῲ", /* ῺΙ (3->5), +2 bytes, +1 chars */ "ʼn", /* ʼN (2->3), +1 bytes, +1 chars */ "ǰ", /* J̌ (2->3), +1 bytes, +1 chars */ "ὐ", /* Υ̓ (3->4), +1 bytes, +1 chars */ "ᾳ", /* ΑΙ (3->4), +1 bytes, +1 chars */ "ᾴ", /* ΆΙ (3->4), +1 bytes, +1 chars */ "ᾶ", /* Α͂ (3->4), +1 bytes, +1 chars */ "ᾼ", /* ΑΙ (3->4), +1 bytes, +1 chars */ "ῃ", /* ΗΙ (3->4), +1 bytes, +1 chars */ "ῄ", /* ΉΙ (3->4), +1 bytes, +1 chars */ "ῆ", /* Η͂ (3->4), +1 bytes, +1 chars */ "ῌ", /* ΗΙ (3->4), +1 bytes, +1 chars */ "ῖ", /* Ι͂ (3->4), +1 bytes, +1 chars */ "ῤ", /* Ρ̓ (3->4), +1 bytes, +1 chars */ "ῦ", /* Υ͂ (3->4), +1 bytes, +1 chars */ "ῳ", /* ΩΙ (3->4), +1 bytes, +1 chars */ "ῴ", /* ΏΙ (3->4), +1 bytes, +1 chars */ "ῶ", /* Ω͂ (3->4), +1 bytes, +1 chars */ "ῼ", /* ΩΙ (3->4), +1 bytes, +1 chars */ "ﬓ", /* ՄՆ (3->4), +1 bytes, +1 chars */ "ﬔ", /* ՄԵ (3->4), +1 bytes, +1 chars */ "ﬕ", /* ՄԻ (3->4), +1 bytes, +1 chars */ "ﬖ", /* ՎՆ (3->4), +1 bytes, +1 chars */ "ﬗ", /* ՄԽ (3->4), +1 bytes, 
+1 chars */ ];