Created
April 21, 2026 14:33
-
-
Save minhnguyent546/6922f6214bb9d8005f661ca0fe3939db to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| # Adapted from: https://github.com/longstnguyen/ViRE/blob/main/src/vi_retrieval_eval/textnorm.py | |
| import re | |
| import unicodedata | |
| # Zero-width / invisible chars | |
| _ZW_CHARS = [ | |
| "\u200b", # zero-width space | |
| "\u200c", # ZWNJ | |
| "\u200d", # ZWJ | |
| "\ufeff", # BOM | |
| ] | |
| # Collapse all whitespace into a single space | |
| _WS_RE = re.compile(r"\s+", flags=re.UNICODE) | |
| # A broad emoji / pictograph range (covers most common emoji) | |
| _EMOJI_RE = re.compile( | |
| "[" # start char class | |
| "\U0001f600-\U0001f64f" # emoticons | |
| "\U0001f300-\U0001f5ff" # symbols & pictographs | |
| "\U0001f680-\U0001f6ff" # transport & map symbols | |
| "\U0001f1e0-\U0001f1ff" # flags | |
| "\U00002702-\U000027b0" # dingbats | |
| "\U000024c2-\U0001f251" # enclosed characters | |
| "]+", | |
| flags=re.UNICODE, | |
| ) | |
| # Control characters (ASCII 0..31, 127) — removed for cleanliness | |
| _CTRL_RE = re.compile(r"[\x00-\x1F\x7F]") | |
| def normalize_for_dedup( | |
| s: str, *, do_lower: bool = False, remove_emoji: bool = False | |
| ) -> str: | |
| """ | |
| Normalize a string for deduplication / matching: | |
| - Unicode normalize NFKC | |
| - remove control chars (ASCII control) | |
| - remove zero-width/invisible characters | |
| - optional: remove emoji/icons if remove_emoji=True | |
| - strip leading/trailing whitespace | |
| - collapse whitespace to a single space | |
| - optional: lowercase if do_lower=True | |
| """ | |
| if s is None: | |
| return "" | |
| s = unicodedata.normalize("NFKC", str(s)) | |
| # Remove control chars | |
| s = _CTRL_RE.sub("", s) | |
| # Remove zero-width/invisible chars | |
| for ch in _ZW_CHARS: | |
| s = s.replace(ch, "") | |
| # Optional: remove emoji/icon | |
| if remove_emoji: | |
| s = _EMOJI_RE.sub("", s) | |
| # Trim + collapse spaces | |
| s = s.strip() | |
| s = _WS_RE.sub(" ", s) | |
| # Optional lower | |
| if do_lower: | |
| s = s.lower() | |
| return s |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment