Skip to content

Instantly share code, notes, and snippets.

@markmysourcecode
Forked from tmacam/normalize_string.py
Created May 30, 2023 08:11
Show Gist options
  • Select an option

  • Save markmysourcecode/b733db6c3b8778633fc2118b06b147d0 to your computer and use it in GitHub Desktop.

Select an option

Save markmysourcecode/b733db6c3b8778633fc2118b06b147d0 to your computer and use it in GitHub Desktop.

Revisions

  1. Tiago Alves Macambira revised this gist May 6, 2013. 1 changed file with 41 additions and 8 deletions.
    49 changes: 41 additions & 8 deletions normalize_string.py
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,12 @@
    #!/usr/bin/python
    # vim:ts=4:sts=4:sw=4:et:wrap:ai:fileencoding=utf-8:

    """A collection of string normalization routines.
    You are probably looking for normalize_string, that does an aggressive (but
    arguably sound) string normalization process.
    """


    from HTMLParser import HTMLParser
    import re
    @@ -13,6 +19,28 @@
    ORD_9 = ord(u'9')


    def try_redecode_utf8(s):
        """Try redecoding utf-8 data inside a (faux-)unicode string.

        Repairs "mojibake": UTF-8 byte sequences that were wrongly decoded
        as latin-1 (possibly more than once). Redecoding is repeated until
        it either fails or produces no further change; the last successful
        result is returned.

        >>> try_redecode_utf8(u'T\xc3\xaanis e Esporte')
        u'T\xeanis e Esporte'
        """
        keep_going = True
        redecoded = s
        # Keep redecoding until redecoding fails or there is no difference
        # in output.
        while keep_going:
            try:
                if isinstance(s, unicode):
                    # Faux-unicode: re-encode as latin-1 to recover the raw
                    # bytes, then decode those bytes as UTF-8.
                    redecoded = s.encode('latin1').decode('utf-8')
                elif isinstance(s, str):
                    redecoded = s.decode('utf-8')
                keep_going = (s != redecoded)
                s = redecoded
            except UnicodeError:
                # Only codec failures end the loop; a bare `except:` here
                # would also swallow KeyboardInterrupt and real bugs.
                keep_going = False
        return redecoded


    class HTMLStripper(HTMLParser):
    "Remove tags and keeps HTML entities intact."
    def __init__(self):
    @@ -37,11 +65,6 @@ def get_data(self):
    return u''.join(self.fed)


    # References
    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # http://stackoverflow.com/questions/9042515/normalizing-unicode-text-to-filenames-etc-in-python


    def isPlainASCIIAlphaNum(c):
    o = ord(c)
    if (ORD_A_MIN <= o <= ORD_Z_MIN) or ((ORD_0 <= o <= ORD_9)):
    @@ -65,6 +88,9 @@ def strip_html_and_convert_entities(html):
    # http://stackoverflow.com/questions/2087370/decode-html-entities-in-python-string
    parser = HTMLStripper()
    parser.feed(html)
    # HTML parser breaks if parsing ends/EOF on a single-letter broken entity
    # such as 'at&t'. Adding an extra space fixes this.
    parser.feed(' ')
    parser.close()
    return parser.unescape(parser.get_data())

    @@ -74,6 +100,9 @@ def normalize_case(s):


    def normalize_diacritics(input_str):
    # References:
    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # http://stackoverflow.com/questions/9042515/normalizing-unicode-text-to-filenames-etc-in-python
    nkfd_form = unicodedata.normalize('NFKD', unicode(input_str))
    return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])

    @@ -119,7 +148,9 @@ def normalize_whitespace(s):
    return s.strip()


    def normalize_string(s):
    def normalize_string(s, fix_utf8=False):
    if fix_utf8:
    s = try_redecode_utf8(s)
    s = strip_html_and_convert_entities(s)
    s = normalize_case(s)
    s = normalize_diacritics(s)
    @@ -142,12 +173,14 @@ def main():
    u"<a>&Ccedil;ine&nbsp;e<br>Foto",
    u'Cine\u65e5\u672c\u8a9eFoto',
    u'Carrinhos e Ve&iacute;culos',
    u'<a href="#">Cine <em>(&eacute; f&#x00f3;to&not; \u0394&#x03b7;&#956;&#x03CE;)</em></a>'
    u'<a href="#">Cine <em>(&eacute; f&#x00f3;to&not; \u0394&#x03b7;&#956;&#x03CE;)</em></a>',
    u'Soul e R&B', # we used to break on this one.
    u'T\xc3\xaanis e Esporte',
    ]
    from collections import defaultdict
    categories = defaultdict(list)
    for i in sample:
    n = normalize_string(i)
    n = normalize_string(i, fix_utf8=True)
    categories[n].append(i)

    for k, v in categories.items():
  2. @tmacam tmacam revised this gist Apr 19, 2013. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions normalize_string.py
    Original file line number Diff line number Diff line change
    @@ -65,6 +65,7 @@ def strip_html_and_convert_entities(html):
    # http://stackoverflow.com/questions/2087370/decode-html-entities-in-python-string
    parser = HTMLStripper()
    parser.feed(html)
    parser.close()
    return parser.unescape(parser.get_data())


    @@ -135,6 +136,7 @@ def normalize_string(s):
    def main():
    sample = [u"Cine e foto",
    u"Cine & foto",
    u"Cine&Foto", # BeautifulSoup breaks for this one.
    u"Cine+foto",
    u"Cíñe_e.foto",
    u"<a>&Ccedil;ine&nbsp;e<br>Foto",
  3. @tmacam tmacam created this gist Apr 19, 2013.
    158 changes: 158 additions & 0 deletions normalize_string.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,158 @@
    #!/usr/bin/python
    # vim:ts=4:sts=4:sw=4:et:wrap:ai:fileencoding=utf-8:


    from HTMLParser import HTMLParser
    import re
    import unicodedata


    # Codepoint bounds used by isPlainASCIIAlphaNum: the inclusive ranges
    # for lower-case ASCII letters and for ASCII digits.
    ORD_A_MIN = ord(u'a')
    ORD_Z_MIN = ord(u'z')
    ORD_0 = ord(u'0')
    ORD_9 = ord(u'9')


    class HTMLStripper(HTMLParser):
        """Parser that removes HTML tags while keeping entities intact.

        Every tag is treated as a word separator and replaced by a single
        space; character and entity references are re-emitted verbatim so
        that a later unescape() pass can convert them all at once.
        """

        def __init__(self):
            # NOTE(review): base __init__ is not called; reset() appears to
            # perform the needed (re)initialisation for this parser.
            self.reset()
            self._chunks = []

        def handle_starttag(self, tag, attrs):
            # Deliberate decision: any tag acts as a word splitter.
            self._chunks.append(' ')

        def handle_data(self, data):
            self._chunks.append(data)

        def handle_charref(self, ref):
            # Re-emit numeric character references untouched.
            self._chunks.append('&#%s;' % ref)

        def handle_entityref(self, entity):
            # Re-emit named entity references untouched.
            self._chunks.append('&%s;' % entity)

        def get_data(self):
            """Return everything collected so far as one unicode string."""
            return u''.join(self._chunks)


    # References
    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # http://stackoverflow.com/questions/9042515/normalizing-unicode-text-to-filenames-etc-in-python


    def isPlainASCIIAlphaNum(c):
        """Return True iff *c* is a lower-case ASCII letter or ASCII digit.

        Upper-case letters are deliberately rejected: callers are expected
        to run normalize_case() first (see normalize_string's pipeline).
        """
        o = ord(c)
        # Return the condition directly instead of if/return True/False.
        return (ORD_A_MIN <= o <= ORD_Z_MIN) or (ORD_0 <= o <= ORD_9)


    def strip_html_and_convert_entities(html):
        """Remove HTML tags from *html* and decode its entities.

        Previously we used the following code, that depends on BeautifulSoup:

            soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
            return u' '.join(soup.findAll(text=True))

        But it does not handle numeric character entities correctly. This
        approach does not depend on BeautifulSoup and uses HTMLParser, which
        is part of python 2.6's std. library. So it is a double-win. :-)

        References:
        http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
        http://stackoverflow.com/questions/2087370/decode-html-entities-in-python-string
        """
        parser = HTMLStripper()
        parser.feed(html)
        # HTMLParser breaks if parsing ends/EOF on a single-letter broken
        # entity such as 'at&t'. Feeding an extra space fixes this.
        parser.feed(' ')
        # close() forces processing of any remaining buffered data.
        parser.close()
        return parser.unescape(parser.get_data())


    def normalize_case(s):
        """Return *s* folded to lower case."""
        return s.lower()


    def normalize_diacritics(input_str):
        """Strip diacritical marks from *input_str* (e.g. accents).

        NFKD decomposition splits accented characters into a base character
        followed by combining marks; the combining marks are then dropped.
        """
        decomposed = unicodedata.normalize('NFKD', unicode(input_str))
        kept = [ch for ch in decomposed if not unicodedata.combining(ch)]
        return u"".join(kept)


    def normalize_to_plain_ascii(s):
        """Force *s* down to plain ASCII; unencodable chars become '?'."""
        ascii_bytes = s.encode('ASCII', 'replace')
        return unicode(ascii_bytes)


    def normalize_to_alphanum_and_spaces(s):
        """Replace every char that is not a plain ASCII letter or digit
        with a space."""
        kept = []
        for ch in s:
            kept.append(ch if isPlainASCIIAlphaNum(ch) else ' ')
        return u"".join(kept)


    # def normalize_diacritics_old(s):
    # """Converts to lowercase, normalizes diacritics and
    # converts non-alphanumeric chars into space.
    # """
    # s = s.lower()
    # s = unicodedata.normalize('NFKD', s)
    # # Filter ascii letters and numbers, discard everyone else
    # filtered = []
    # for c in s:
    # if isPlainASCIIAlphaNum(c):
    # filtered.append(c)
    # elif unicodedata.category(c) == 'Mn':
    # continue
    # else:
    # filtered.append(u' ')
    # return u' '.join(''.join(filtered).split())


    def normalize_prepositions(s):
        """Replace common (Portuguese/English) prepositions with a space.

        A single whole-word alternation pass is equivalent to the previous
        one-sub-per-word loop: the replacement is a space, so removing one
        preposition can never create a new match for another.
        """
        prepositions = ['e', 'and', 'de', 'do', 'da']
        pattern = r'\b(?:' + '|'.join(prepositions) + r')\b'
        return re.sub(pattern, " ", s)


    def normalize_whitespace(s):
        """Collapse runs of whitespace to single spaces and trim the ends."""
        # Raw string: '\s' is a regex escape, not a Python string escape.
        s = re.sub(r'\s+', ' ', s)
        return s.strip()


    def normalize_string(s):
        """Aggressively normalize *s* to lower-case ASCII alphanumerics
        separated by single spaces.
        """
        # Normalization steps, applied in order.
        pipeline = (
            strip_html_and_convert_entities,
            normalize_case,
            normalize_diacritics,
            normalize_to_plain_ascii,
            normalize_to_alphanum_and_spaces,
            # normalize_prepositions is deliberately left out of the pipeline.
            normalize_whitespace,
        )
        for step in pipeline:
            s = step(s)
        # No final re-normalization to a known unicode form (say, NFC) is
        # needed: `s` is now plain ASCII alphanumerics and spaces only, so
        # there is no "combined" nor "decomposed" unicode content left.
        return s


    def main():
    sample = [u"Cine e foto",
    u"Cine & foto",
    u"Cine+foto",
    u"Cíñe_e.foto",
    u"<a>&Ccedil;ine&nbsp;e<br>Foto",
    u'Cine\u65e5\u672c\u8a9eFoto',
    u'Carrinhos e Ve&iacute;culos',
    u'<a href="#">Cine <em>(&eacute; f&#x00f3;to&not; \u0394&#x03b7;&#956;&#x03CE;)</em></a>'
    ]
    from collections import defaultdict
    categories = defaultdict(list)
    for i in sample:
    n = normalize_string(i)
    categories[n].append(i)

    for k, v in categories.items():
    print k, v

    return categories


    # Run the demo when executed as a script (not when imported).
    if __name__ == '__main__':
        main()