rendello · December 25, 2025 23:54 · Nov 1, 2024 · Nov 1, 2024 · Nov 1, 2024 · Nov 1, 2024
diff --git a/utf8_case_data.rs → _utf8_case_data.rs b/utf8_case_data.rs → _utf8_case_data.rs
diff --git a/generate_utf8.py b/generate_utf8.py
@@ -1,3 +1,18 @@
+"""
+Copyright (c) 2024 Rendello
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
+REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
+INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THIS SOFTWARE.
+"""
+
 import sys
 from dataclasses import dataclass
 from typing import List, Dict

diff --git a/utf8_case_data.rs b/utf8_case_data.rs
@@ -13,10 +13,9 @@ OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 PERFORMANCE OF THIS SOFTWARE.
 */
 
-// =======================================================================
-//! Unicode characters that behave oddly when the case is changed, for use
-//! with property tests.
-// =======================================================================
+// ==========================================================================
+//! Unicode codepoints that expand or contract when case is changed in UTF-8.
+// ==========================================================================
 
 pub const LOWERCASING_CONTRACTS: [&str; 22] = [
     "ẞ",	/* ß	(3->2), -1 bytes */

diff --git a/generate_utf8.py b/generate_utf8.py
@@ -0,0 +1,89 @@
+import sys
+from dataclasses import dataclass
+from typing import List, Dict
+
+@dataclass
+class Entry:
+    """A source string `a`, its case-mapped form `b`, and their sizes.
+
+    Field meanings below follow how `create_entry_map` fills the record.
+    """
+    a: str  # original one-code-point string, chr(i)
+    b: str  # `a` after .upper() or .lower(); may hold several code points
+    a_len: int  # length of `a` encoded as UTF-8, in bytes
+    b_len: int  # length of `b` encoded as UTF-8, in bytes
+    delta: int  # b_len - a_len
+    a_char_count: int  # len(a)
+    b_char_count: int  # len(b)
+    delta_char_count: int  # b_char_count - a_char_count
+
+
+def sort_entries(l: List[Entry]) -> List[Entry]:
+    """ Sorted by size delta, then alphabetically. """
+    return sorted(l, key=
+        lambda p: (-(p.delta_char_count), -(p.delta), p.a))
+
+
+def create_entry_map() -> dict[str, list[Entry]]:
+    entry_map = {}
+    for i in range(sys.maxunicode + 1):
+        a = chr(i)
+
+        for (case, b) in (('uppercasing', a.upper()), ('lowercasing', a.lower())):
+            attributes = [case]
+
+            try:
+                a_len = len(a.encode("utf8"))
+                b_len = len(b.encode("utf8"))
+            except UnicodeEncodeError:
+                continue
+
+            if a_len == b_len:
+                continue
+
+            delta = b_len - a_len
+
+            a_char_count = len(a)
+            b_char_count = len(b)
+            delta_char_count = b_char_count - a_char_count
+
+            if a_len < b_len:
+                attributes.append('expands')
+            elif a_len > b_len:
+                attributes.append('contracts')
+
+            if b_char_count > 1:
+                attributes.append('multi_char')
+
+            key = "_".join(attributes)
+            value = Entry(a, b, a_len, b_len, delta, a_char_count, b_char_count, delta_char_count)
+
+            if key not in entry_map:
+                entry_map[key] = [value]
+            else:
+                entry_map[key].append(value)
+
+    return entry_map
+
+
+def entry_map_to_string(entry_map: Dict[str, List[Entry]]) -> str:
+    buffer = (
+        f'''// =======================================================================\n'''
+        f'''//! Automatically generated using `task generate-utf8-case-data`.\n//!\n'''
+        f'''//! Unicode characters that behave oddly when the case is changed, for use\n'''
+        f'''//! with property tests.\n'''
+        f'''// =======================================================================\n\n'''
+    )
+    for key, unsorted_entries in sorted(list(entry_map.items())):
+        entries = sort_entries(unsorted_entries)
+
+        buffer += f'pub const {key.upper()}: [&str; {len(entries)}] = [\n'
+        for e in entries:
+
+            ds = ""
+            if e.delta_char_count != 0:
+                ds = f", {e.delta_char_count:+} chars"
+
+            buffer += f'    "{e.a}",\t/* {e.b}\t({e.a_len}->{e.b_len}), {e.delta:+} bytes{ds} */\n'
+        buffer += "];\n\n"
+    return buffer.strip()
+
+
+def generate_utf8_case_data():
+    return entry_map_to_string(create_entry_map())
diff --git a/utf8_case_data.rs b/utf8_case_data.rs
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2024 Gaven Rendell
+Copyright (c) 2024 Rendello
 
 Permission to use, copy, modify, and/or distribute this software for any
 purpose with or without fee is hereby granted.

diff --git a/utf8_case_data.rs b/utf8_case_data.rs
@@ -1,6 +1,4 @@
 /*
-BSD Zero Clause License
-
 Copyright (c) 2024 Gaven Rendell
 
 Permission to use, copy, modify, and/or distribute this software for any

diff --git a/utf8_case_data.rs b/utf8_case_data.rs
@@ -0,0 +1,192 @@
+/*
+BSD Zero Clause License
+
+Copyright (c) 2024 Gaven Rendell
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
+REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
+INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THIS SOFTWARE.
+*/
+
+// =======================================================================
+//! Unicode characters that behave oddly when the case is changed, for use
+//! with property tests.
+// =======================================================================
+
+pub const LOWERCASING_CONTRACTS: [&str; 22] = [
+    "ẞ",	/* ß	(3->2), -1 bytes */
+    "Ω",	/* ω	(3->2), -1 bytes */
+    "Å",	/* å	(3->2), -1 bytes */
+    "Ɫ",	/* ɫ	(3->2), -1 bytes */
+    "Ɽ",	/* ɽ	(3->2), -1 bytes */
+    "Ɑ",	/* ɑ	(3->2), -1 bytes */
+    "Ɱ",	/* ɱ	(3->2), -1 bytes */
+    "Ɐ",	/* ɐ	(3->2), -1 bytes */
+    "Ɒ",	/* ɒ	(3->2), -1 bytes */
+    "Ȿ",	/* ȿ	(3->2), -1 bytes */
+    "Ɀ",	/* ɀ	(3->2), -1 bytes */
+    "Ɥ",	/* ɥ	(3->2), -1 bytes */
+    "Ɦ",	/* ɦ	(3->2), -1 bytes */
+    "Ɜ",	/* ɜ	(3->2), -1 bytes */
+    "Ɡ",	/* ɡ	(3->2), -1 bytes */
+    "Ɬ",	/* ɬ	(3->2), -1 bytes */
+    "Ɪ",	/* ɪ	(3->2), -1 bytes */
+    "Ʞ",	/* ʞ	(3->2), -1 bytes */
+    "Ʇ",	/* ʇ	(3->2), -1 bytes */
+    "Ʝ",	/* ʝ	(3->2), -1 bytes */
+    "Ʂ",	/* ʂ	(3->2), -1 bytes */
+    "K",	/* k	(3->1), -2 bytes */
+];
+
+pub const LOWERCASING_EXPANDS: [&str; 2] = [
+    "Ⱥ",	/* ⱥ	(2->3), +1 bytes */
+    "Ⱦ",	/* ⱦ	(2->3), +1 bytes */
+];
+
+pub const LOWERCASING_EXPANDS_MULTI_CHAR: [&str; 1] = [
+    "İ",	/* i̇	(2->3), +1 bytes, +1 chars */
+];
+
+pub const UPPERCASING_CONTRACTS: [&str; 13] = [
+    "ı",	/* I	(2->1), -1 bytes */
+    "ſ",	/* S	(2->1), -1 bytes */
+    "ᲀ",	/* В	(3->2), -1 bytes */
+    "ᲁ",	/* Д	(3->2), -1 bytes */
+    "ᲂ",	/* О	(3->2), -1 bytes */
+    "ᲃ",	/* С	(3->2), -1 bytes */
+    "ᲄ",	/* Т	(3->2), -1 bytes */
+    "ᲅ",	/* Т	(3->2), -1 bytes */
+    "ᲆ",	/* Ъ	(3->2), -1 bytes */
+    "ᲇ",	/* Ѣ	(3->2), -1 bytes */
+    "ι",	/* Ι	(3->2), -1 bytes */
+    "ⱥ",	/* Ⱥ	(3->2), -1 bytes */
+    "ⱦ",	/* Ⱦ	(3->2), -1 bytes */
+];
+
+pub const UPPERCASING_CONTRACTS_MULTI_CHAR: [&str; 5] = [
+    "ﬀ",	/* FF	(3->2), -1 bytes, +1 chars */
+    "ﬁ",	/* FI	(3->2), -1 bytes, +1 chars */
+    "ﬂ",	/* FL	(3->2), -1 bytes, +1 chars */
+    "ﬅ",	/* ST	(3->2), -1 bytes, +1 chars */
+    "ﬆ",	/* ST	(3->2), -1 bytes, +1 chars */
+];
+
+pub const UPPERCASING_EXPANDS: [&str; 18] = [
+    "ȿ",	/* Ȿ	(2->3), +1 bytes */
+    "ɀ",	/* Ɀ	(2->3), +1 bytes */
+    "ɐ",	/* Ɐ	(2->3), +1 bytes */
+    "ɑ",	/* Ɑ	(2->3), +1 bytes */
+    "ɒ",	/* Ɒ	(2->3), +1 bytes */
+    "ɜ",	/* Ɜ	(2->3), +1 bytes */
+    "ɡ",	/* Ɡ	(2->3), +1 bytes */
+    "ɥ",	/* Ɥ	(2->3), +1 bytes */
+    "ɦ",	/* Ɦ	(2->3), +1 bytes */
+    "ɪ",	/* Ɪ	(2->3), +1 bytes */
+    "ɫ",	/* Ɫ	(2->3), +1 bytes */
+    "ɬ",	/* Ɬ	(2->3), +1 bytes */
+    "ɱ",	/* Ɱ	(2->3), +1 bytes */
+    "ɽ",	/* Ɽ	(2->3), +1 bytes */
+    "ʂ",	/* Ʂ	(2->3), +1 bytes */
+    "ʇ",	/* Ʇ	(2->3), +1 bytes */
+    "ʝ",	/* Ʝ	(2->3), +1 bytes */
+    "ʞ",	/* Ʞ	(2->3), +1 bytes */
+];
+
+pub const UPPERCASING_EXPANDS_MULTI_CHAR: [&str; 89] = [
+    "ΐ",	/* Ϊ́	(2->6), +4 bytes, +2 chars */
+    "ΰ",	/* Ϋ́	(2->6), +4 bytes, +2 chars */
+    "ὒ",	/* Υ̓̀	(3->6), +3 bytes, +2 chars */
+    "ὔ",	/* Υ̓́	(3->6), +3 bytes, +2 chars */
+    "ὖ",	/* Υ̓͂	(3->6), +3 bytes, +2 chars */
+    "ᾷ",	/* Α͂Ι	(3->6), +3 bytes, +2 chars */
+    "ῇ",	/* Η͂Ι	(3->6), +3 bytes, +2 chars */
+    "ῒ",	/* Ϊ̀	(3->6), +3 bytes, +2 chars */
+    "ΐ",	/* Ϊ́	(3->6), +3 bytes, +2 chars */
+    "ῗ",	/* Ϊ͂	(3->6), +3 bytes, +2 chars */
+    "ῢ",	/* Ϋ̀	(3->6), +3 bytes, +2 chars */
+    "ΰ",	/* Ϋ́	(3->6), +3 bytes, +2 chars */
+    "ῧ",	/* Ϋ͂	(3->6), +3 bytes, +2 chars */
+    "ῷ",	/* Ω͂Ι	(3->6), +3 bytes, +2 chars */
+    "և",	/* ԵՒ	(2->4), +2 bytes, +1 chars */
+    "ᾀ",	/* ἈΙ	(3->5), +2 bytes, +1 chars */
+    "ᾁ",	/* ἉΙ	(3->5), +2 bytes, +1 chars */
+    "ᾂ",	/* ἊΙ	(3->5), +2 bytes, +1 chars */
+    "ᾃ",	/* ἋΙ	(3->5), +2 bytes, +1 chars */
+    "ᾄ",	/* ἌΙ	(3->5), +2 bytes, +1 chars */
+    "ᾅ",	/* ἍΙ	(3->5), +2 bytes, +1 chars */
+    "ᾆ",	/* ἎΙ	(3->5), +2 bytes, +1 chars */
+    "ᾇ",	/* ἏΙ	(3->5), +2 bytes, +1 chars */
+    "ᾈ",	/* ἈΙ	(3->5), +2 bytes, +1 chars */
+    "ᾉ",	/* ἉΙ	(3->5), +2 bytes, +1 chars */
+    "ᾊ",	/* ἊΙ	(3->5), +2 bytes, +1 chars */
+    "ᾋ",	/* ἋΙ	(3->5), +2 bytes, +1 chars */
+    "ᾌ",	/* ἌΙ	(3->5), +2 bytes, +1 chars */
+    "ᾍ",	/* ἍΙ	(3->5), +2 bytes, +1 chars */
+    "ᾎ",	/* ἎΙ	(3->5), +2 bytes, +1 chars */
+    "ᾏ",	/* ἏΙ	(3->5), +2 bytes, +1 chars */
+    "ᾐ",	/* ἨΙ	(3->5), +2 bytes, +1 chars */
+    "ᾑ",	/* ἩΙ	(3->5), +2 bytes, +1 chars */
+    "ᾒ",	/* ἪΙ	(3->5), +2 bytes, +1 chars */
+    "ᾓ",	/* ἫΙ	(3->5), +2 bytes, +1 chars */
+    "ᾔ",	/* ἬΙ	(3->5), +2 bytes, +1 chars */
+    "ᾕ",	/* ἭΙ	(3->5), +2 bytes, +1 chars */
+    "ᾖ",	/* ἮΙ	(3->5), +2 bytes, +1 chars */
+    "ᾗ",	/* ἯΙ	(3->5), +2 bytes, +1 chars */
+    "ᾘ",	/* ἨΙ	(3->5), +2 bytes, +1 chars */
+    "ᾙ",	/* ἩΙ	(3->5), +2 bytes, +1 chars */
+    "ᾚ",	/* ἪΙ	(3->5), +2 bytes, +1 chars */
+    "ᾛ",	/* ἫΙ	(3->5), +2 bytes, +1 chars */
+    "ᾜ",	/* ἬΙ	(3->5), +2 bytes, +1 chars */
+    "ᾝ",	/* ἭΙ	(3->5), +2 bytes, +1 chars */
+    "ᾞ",	/* ἮΙ	(3->5), +2 bytes, +1 chars */
+    "ᾟ",	/* ἯΙ	(3->5), +2 bytes, +1 chars */
+    "ᾠ",	/* ὨΙ	(3->5), +2 bytes, +1 chars */
+    "ᾡ",	/* ὩΙ	(3->5), +2 bytes, +1 chars */
+    "ᾢ",	/* ὪΙ	(3->5), +2 bytes, +1 chars */
+    "ᾣ",	/* ὫΙ	(3->5), +2 bytes, +1 chars */
+    "ᾤ",	/* ὬΙ	(3->5), +2 bytes, +1 chars */
+    "ᾥ",	/* ὭΙ	(3->5), +2 bytes, +1 chars */
+    "ᾦ",	/* ὮΙ	(3->5), +2 bytes, +1 chars */
+    "ᾧ",	/* ὯΙ	(3->5), +2 bytes, +1 chars */
+    "ᾨ",	/* ὨΙ	(3->5), +2 bytes, +1 chars */
+    "ᾩ",	/* ὩΙ	(3->5), +2 bytes, +1 chars */
+    "ᾪ",	/* ὪΙ	(3->5), +2 bytes, +1 chars */
+    "ᾫ",	/* ὫΙ	(3->5), +2 bytes, +1 chars */
+    "ᾬ",	/* ὬΙ	(3->5), +2 bytes, +1 chars */
+    "ᾭ",	/* ὭΙ	(3->5), +2 bytes, +1 chars */
+    "ᾮ",	/* ὮΙ	(3->5), +2 bytes, +1 chars */
+    "ᾯ",	/* ὯΙ	(3->5), +2 bytes, +1 chars */
+    "ᾲ",	/* ᾺΙ	(3->5), +2 bytes, +1 chars */
+    "ῂ",	/* ῊΙ	(3->5), +2 bytes, +1 chars */
+    "ῲ",	/* ῺΙ	(3->5), +2 bytes, +1 chars */
+    "ŉ",	/* ʼN	(2->3), +1 bytes, +1 chars */
+    "ǰ",	/* J̌	(2->3), +1 bytes, +1 chars */
+    "ὐ",	/* Υ̓	(3->4), +1 bytes, +1 chars */
+    "ᾳ",	/* ΑΙ	(3->4), +1 bytes, +1 chars */
+    "ᾴ",	/* ΆΙ	(3->4), +1 bytes, +1 chars */
+    "ᾶ",	/* Α͂	(3->4), +1 bytes, +1 chars */
+    "ᾼ",	/* ΑΙ	(3->4), +1 bytes, +1 chars */
+    "ῃ",	/* ΗΙ	(3->4), +1 bytes, +1 chars */
+    "ῄ",	/* ΉΙ	(3->4), +1 bytes, +1 chars */
+    "ῆ",	/* Η͂	(3->4), +1 bytes, +1 chars */
+    "ῌ",	/* ΗΙ	(3->4), +1 bytes, +1 chars */
+    "ῖ",	/* Ι͂	(3->4), +1 bytes, +1 chars */
+    "ῤ",	/* Ρ̓	(3->4), +1 bytes, +1 chars */
+    "ῦ",	/* Υ͂	(3->4), +1 bytes, +1 chars */
+    "ῳ",	/* ΩΙ	(3->4), +1 bytes, +1 chars */
+    "ῴ",	/* ΏΙ	(3->4), +1 bytes, +1 chars */
+    "ῶ",	/* Ω͂	(3->4), +1 bytes, +1 chars */
+    "ῼ",	/* ΩΙ	(3->4), +1 bytes, +1 chars */
+    "ﬓ",	/* ՄՆ	(3->4), +1 bytes, +1 chars */
+    "ﬔ",	/* ՄԵ	(3->4), +1 bytes, +1 chars */
+    "ﬕ",	/* ՄԻ	(3->4), +1 bytes, +1 chars */
+    "ﬖ",	/* ՎՆ	(3->4), +1 bytes, +1 chars */
+    "ﬗ",	/* ՄԽ	(3->4), +1 bytes, +1 chars */
+];
No results found