Skip to content

Instantly share code, notes, and snippets.

@roedoejet
Created September 16, 2024 20:11
Show Gist options
  • Select an option

  • Save roedoejet/d3a2e91b4bba32a4ddbcfbdba77a87bc to your computer and use it in GitHub Desktop.

Select an option

Save roedoejet/d3a2e91b4bba32a4ddbcfbdba77a87bc to your computer and use it in GitHub Desktop.
import re
from unicodedata import normalize
from nltk.tokenize import RegexpTokenizer
class Tokenizer:
    """Greedy longest-match tokenizer over a fixed symbol inventory.

    Symbols are NFC-normalized and matched longest-first, so multi-codepoint
    symbols (e.g. a base letter followed by a combining mark) take precedence
    over their single-codepoint prefixes. Input characters that match no
    symbol are silently dropped by the underlying regex tokenizer.
    """

    def __init__(self, symbols: list[str]):
        # NFC-normalize every symbol, then order longest-first (ties broken
        # alphabetically) so the regex alternation prefers the longest match.
        normalized = (normalize("NFC", s) for s in symbols)
        self.symbols = sorted(normalized, key=lambda s: (-len(s), s))
        self.tokenizer = self.create_tokenizer(self.symbols)

    def create_tokenizer(self, symbols: list[str]) -> RegexpTokenizer:
        # Escape each symbol so regex metacharacters are treated literally.
        pattern = "|".join(map(re.escape, symbols))
        return RegexpTokenizer(pattern)

    def tokenize(self, text: str) -> list[str]:
        # Normalize the input the same way the inventory was normalized,
        # so combining-mark sequences line up with the stored symbols.
        return self.tokenizer.tokenize(normalize("NFC", text))
if __name__ == "__main__":
    # SENĆOŦEN-style character inventory for a quick demonstration; it
    # includes symbols that NFC leaves as base + combining-mark sequences
    # (e.g. "W̱", "X̱"), which exercise the longest-first matching.
    STR_CHARACTERS = [
        "A", "B", "C", "D", "E", "H", "I", "J", "K", "L",
        "M", "N", "O", "P", "Q", "S", "T", "U", "W", "W̱",
        "X", "X̱", "Y", "¸", "Á", "Í", "Ć", "Ś", "Ŧ", "Ⱥ",
        "Ȼ", "Ƚ", "Ⱦ", "Ḱ", "Ḵ", "Ṉ", "Ṯ", "₭",
    ]
    STR_TOKENIZER = Tokenizer(STR_CHARACTERS)
    print(STR_TOKENIZER.tokenize("ÍY SȻÁĆEL"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment