-
-
Save markmysourcecode/b733db6c3b8778633fc2118b06b147d0 to your computer and use it in GitHub Desktop.
Revisions
-
Tiago Alves Macambira revised this gist
May 6, 2013: 1 changed file with 41 additions and 8 deletions. There are no files selected for viewing.
#!/usr/bin/python
# vim:ts=4:sts=4:sw=4:et:wrap:ai:fileencoding=utf-8:
"""A collection of string normalization routines.

You are probably looking for normalize_string, which performs an
aggressive (but arguably sound) string normalization process.
"""

from html import unescape
from html.parser import HTMLParser
import re
import unicodedata

# Codepoint boundaries for the plain-ASCII alphanumeric test below.
# Only lowercase letters are accepted because normalization lowercases first.
ORD_A_MIN = ord('a')
ORD_Z_MIN = ord('z')
ORD_0 = ord('0')
ORD_9 = ord('9')


def try_redecode_utf8(s):
    """Try redecoding UTF-8 data hiding inside a mis-decoded string.

    Repairs mojibake such as 'T\xc3\xaanis e Esporte' -> 'Tênis e Esporte'
    by repeatedly re-encoding as latin-1 and decoding as UTF-8 until the
    result stops changing or a codec error occurs.

    Args:
        s: a str (possibly mojibake) or raw bytes.
    Returns:
        The deepest successfully re-decoded str (the input unchanged if no
        redecoding round succeeded).
    """
    redecoded = s
    keep_going = True
    # Keep redecoding until redecoding fails or there is no change in output.
    while keep_going:
        try:
            if isinstance(s, str):
                redecoded = s.encode('latin1').decode('utf-8')
            elif isinstance(s, bytes):
                redecoded = s.decode('utf-8')
            keep_going = (s != redecoded)
            s = redecoded
        except (UnicodeDecodeError, UnicodeEncodeError):
            # Only codec failures end the loop; a bare `except:` here would
            # also hide real bugs (e.g. AttributeError on a wrong type).
            keep_going = False
    return redecoded


class HTMLStripper(HTMLParser):
    """Remove tags while keeping HTML entities intact for later unescaping."""

    def __init__(self):
        # convert_charrefs=False so entity/char references reach our
        # handlers verbatim; they are all unescaped in one pass later.
        super().__init__(convert_charrefs=False)
        self.fed = []  # accumulated text fragments

    def handle_starttag(self, tag, attrs):
        # Design decision: all/any tag is a word-splitter and thus
        # is converted to a space.
        self.fed.append(' ')

    def handle_data(self, d):
        self.fed.append(d)

    def handle_charref(self, number):
        # Re-emit numeric character references untouched.
        self.fed.append('&#%s;' % number)

    def handle_entityref(self, name):
        # Re-emit named entities untouched.
        self.fed.append('&%s;' % name)

    def get_data(self):
        """Return everything fed so far as a single string."""
        return ''.join(self.fed)


def isPlainASCIIAlphaNum(c):
    """Return True iff `c` is a lowercase ASCII letter or an ASCII digit."""
    o = ord(c)
    return (ORD_A_MIN <= o <= ORD_Z_MIN) or (ORD_0 <= o <= ORD_9)


def strip_html_and_convert_entities(html):
    """Strip HTML tags from `html`, converting entities to characters.

    Tags become spaces (word splitters); named and numeric entities are
    preserved during parsing and unescaped in a single pass at the end.
    """
    # http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
    # http://stackoverflow.com/questions/2087370/decode-html-entities-in-python-string
    parser = HTMLStripper()
    parser.feed(html)
    # The HTML parser breaks if parsing ends/EOF on a single-letter broken
    # entity such as 'at&t'. Adding an extra space fixes this.
    parser.feed(' ')
    parser.close()
    return unescape(parser.get_data())


def normalize_case(s):
    """Lowercase `s`."""
    return s.lower()


def normalize_diacritics(input_str):
    """Remove diacritics (accents) via NFKD decomposition."""
    # References:
    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # http://stackoverflow.com/questions/9042515/normalizing-unicode-text-to-filenames-etc-in-python
    nkfd_form = unicodedata.normalize('NFKD', str(input_str))
    return ''.join(c for c in nkfd_form if not unicodedata.combining(c))


def normalize_to_plain_ascii(s):
    """Replace every non-ASCII character in `s` with '?'."""
    return s.encode('ascii', 'replace').decode('ascii')


def normalize_to_alphanum_and_spaces(s):
    """Replace every char that is not a lowercase letter or digit by space."""
    return ''.join(c if isPlainASCIIAlphaNum(c) else ' ' for c in s)


def normalize_prepositions(s):
    """Replace common (pt/en) prepositions by a space."""
    prepositions = ['e', 'and', 'de', 'do', 'da']
    for prep in prepositions:
        pattern = r'\b' + prep + r'\b'
        s = re.sub(pattern, ' ', s)
    return s


def normalize_whitespace(s):
    """Collapse whitespace runs into single spaces and strip the ends."""
    s = re.sub(r'\s+', ' ', s)
    return s.strip()


def normalize_string(s, fix_utf8=False):
    """Aggressively normalize `s` down to lowercase ASCII alphanumerics.

    Args:
        s: the string to normalize (may contain HTML markup and entities).
        fix_utf8: if True, first attempt mojibake repair via
            try_redecode_utf8.
    Returns:
        A plain-ASCII, lowercase, single-spaced string.
    """
    if fix_utf8:
        s = try_redecode_utf8(s)
    s = strip_html_and_convert_entities(s)
    s = normalize_case(s)
    s = normalize_diacritics(s)
    s = normalize_to_plain_ascii(s)
    s = normalize_to_alphanum_and_spaces(s)
    # s = normalize_prepositions(s)
    s = normalize_whitespace(s)
    # We don't need to re-normalize to a known unicode form (say, NFC) since
    # we only have plain ASCII alphanumeric data at this point: there is no
    # "combined" nor "decomposed" unicode content left in `s`.
    return s


def main():
    sample = [
        'Cine e foto',
        'Cine & foto',
        'Cine&Foto',  # BeautifulSoup used to break for this one.
        'Cine+foto',
        'Cíñe_e.foto',
        '<a>Çine e<br>Foto',
        'Cine\u65e5\u672c\u8a9eFoto',
        'Carrinhos e Veículos',
        '<a href="#">Cine <em>(é fóto¬ \u0394ημώ)</em></a>',
        'Soul e R&B',  # we used to break on this one.
        'T\xc3\xaanis e Esporte',
    ]
    from collections import defaultdict
    categories = defaultdict(list)
    for i in sample:
        n = normalize_string(i, fix_utf8=True)
        categories[n].append(i)
    for k, v in categories.items():
        print(k, v)
    return categories


if __name__ == '__main__':
    main()
tmacam revised this gist
Apr 19, 2013: 1 changed file with 2 additions and 0 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -65,6 +65,7 @@ def strip_html_and_convert_entities(html): # http://stackoverflow.com/questions/2087370/decode-html-entities-in-python-string parser = HTMLStripper() parser.feed(html) parser.close() return parser.unescape(parser.get_data()) @@ -135,6 +136,7 @@ def normalize_string(s): def main(): sample = [u"Cine e foto", u"Cine & foto", u"Cine&Foto", # BeautifulSoup breaks for this one. u"Cine+foto", u"Cíñe_e.foto", u"<a>Çine e<br>Foto", -
tmacam created this gist
Apr 19, 2013. There are no files selected for viewing.
#!/usr/bin/python
# vim:ts=4:sts=4:sw=4:et:wrap:ai:fileencoding=utf-8:
"""A collection of string normalization routines.

normalize_string performs an aggressive (but arguably sound) string
normalization process: HTML stripping, entity decoding, lowercasing,
diacritic removal, and reduction to ASCII alphanumerics and spaces.
"""

from html import unescape
from html.parser import HTMLParser
import re
import unicodedata

# Codepoint boundaries for the plain-ASCII alphanumeric test below.
# Only lowercase letters are accepted because normalization lowercases first.
ORD_A_MIN = ord('a')
ORD_Z_MIN = ord('z')
ORD_0 = ord('0')
ORD_9 = ord('9')


class HTMLStripper(HTMLParser):
    """Remove tags while keeping HTML entities intact for later unescaping."""

    def __init__(self):
        # convert_charrefs=False so entity/char references reach our
        # handlers verbatim; they are all unescaped in one pass later.
        super().__init__(convert_charrefs=False)
        self.fed = []  # accumulated text fragments

    def handle_starttag(self, tag, attrs):
        # Design decision: all/any tag is a word-splitter and thus
        # is converted to a space.
        self.fed.append(' ')

    def handle_data(self, d):
        self.fed.append(d)

    def handle_charref(self, number):
        # Re-emit numeric character references untouched.
        self.fed.append('&#%s;' % number)

    def handle_entityref(self, name):
        # Re-emit named entities untouched.
        self.fed.append('&%s;' % name)

    def get_data(self):
        """Return everything fed so far as a single string."""
        return ''.join(self.fed)


def isPlainASCIIAlphaNum(c):
    """Return True iff `c` is a lowercase ASCII letter or an ASCII digit."""
    o = ord(c)
    return (ORD_A_MIN <= o <= ORD_Z_MIN) or (ORD_0 <= o <= ORD_9)


def strip_html_and_convert_entities(html):
    """Strip HTML tags from `html`, converting entities to characters.

    Previously this used BeautifulSoup, but that did not handle numeric
    character entities correctly. The HTMLParser-based approach has no
    third-party dependency and handles them fine. :-)
    """
    # http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
    # http://stackoverflow.com/questions/2087370/decode-html-entities-in-python-string
    parser = HTMLStripper()
    parser.feed(html)
    # The HTML parser breaks if parsing ends/EOF on a single-letter broken
    # entity such as 'at&t'. Feeding an extra space, then closing the
    # parser to flush buffered data, fixes this.
    parser.feed(' ')
    parser.close()
    return unescape(parser.get_data())


def normalize_case(s):
    """Lowercase `s`."""
    return s.lower()


def normalize_diacritics(input_str):
    """Remove diacritics (accents) via NFKD decomposition."""
    # References:
    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # http://stackoverflow.com/questions/9042515/normalizing-unicode-text-to-filenames-etc-in-python
    nkfd_form = unicodedata.normalize('NFKD', str(input_str))
    return ''.join(c for c in nkfd_form if not unicodedata.combining(c))


def normalize_to_plain_ascii(s):
    """Replace every non-ASCII character in `s` with '?'."""
    return s.encode('ascii', 'replace').decode('ascii')


def normalize_to_alphanum_and_spaces(s):
    """Replace every char that is not a lowercase letter or digit by space."""
    return ''.join(c if isPlainASCIIAlphaNum(c) else ' ' for c in s)


def normalize_prepositions(s):
    """Replace common (pt/en) prepositions by a space."""
    prepositions = ['e', 'and', 'de', 'do', 'da']
    for prep in prepositions:
        pattern = r'\b' + prep + r'\b'
        s = re.sub(pattern, ' ', s)
    return s


def normalize_whitespace(s):
    """Collapse whitespace runs into single spaces and strip the ends."""
    s = re.sub(r'\s+', ' ', s)
    return s.strip()


def normalize_string(s):
    """Aggressively normalize `s` down to lowercase ASCII alphanumerics.

    Args:
        s: the string to normalize (may contain HTML markup and entities).
    Returns:
        A plain-ASCII, lowercase, single-spaced string.
    """
    s = strip_html_and_convert_entities(s)
    s = normalize_case(s)
    s = normalize_diacritics(s)
    s = normalize_to_plain_ascii(s)
    s = normalize_to_alphanum_and_spaces(s)
    # s = normalize_prepositions(s)
    s = normalize_whitespace(s)
    # We don't need to re-normalize to a known unicode form (say, NFC) since
    # we only have plain ASCII alphanumeric data at this point: there is no
    # "combined" nor "decomposed" unicode content left in `s`.
    return s


def main():
    sample = [
        'Cine e foto',
        'Cine & foto',
        'Cine+foto',
        'Cíñe_e.foto',
        '<a>Çine e<br>Foto',
        'Cine\u65e5\u672c\u8a9eFoto',
        'Carrinhos e Veículos',
        '<a href="#">Cine <em>(é fóto¬ \u0394ημώ)</em></a>',
    ]
    from collections import defaultdict
    categories = defaultdict(list)
    for i in sample:
        n = normalize_string(i)
        categories[n].append(i)
    for k, v in categories.items():
        print(k, v)
    return categories


if __name__ == '__main__':
    main()