-
-
Save markmysourcecode/b733db6c3b8778633fc2118b06b147d0 to your computer and use it in GitHub Desktop.
Revisions
-
Tiago Alves Macambira revised this gist
May 6, 2013: 1 changed file with 41 additions and 8 deletions. There are no files selected for viewing.
#!/usr/bin/python
# vim:ts=4:sts=4:sw=4:et:wrap:ai:fileencoding=utf-8:
"""A collection of string normalization routines.

You are probably looking for normalize_string, which performs an
aggressive (but arguably sound) string normalization process.
"""

from html import unescape
from html.parser import HTMLParser
import re
import unicodedata

# Codepoint boundaries for the plain-ASCII alphanumeric test below.
# Only lowercase letters are accepted because normalization lowercases first.
ORD_A_MIN = ord('a')
ORD_Z_MIN = ord('z')
ORD_0 = ord('0')
ORD_9 = ord('9')


def try_redecode_utf8(s):
    """Try redecoding UTF-8 data hiding inside a mis-decoded string.

    Repairs mojibake such as 'T\xc3\xaanis e Esporte' -> 'Tênis e Esporte'
    by repeatedly re-encoding as latin-1 and decoding as UTF-8 until the
    result stops changing or a codec error occurs.

    Args:
        s: a str (possibly mojibake) or raw bytes.
    Returns:
        The deepest successfully re-decoded str (the input unchanged if no
        redecoding round succeeded).
    """
    redecoded = s
    keep_going = True
    # Keep redecoding until redecoding fails or there is no change in output.
    while keep_going:
        try:
            if isinstance(s, str):
                redecoded = s.encode('latin1').decode('utf-8')
            elif isinstance(s, bytes):
                redecoded = s.decode('utf-8')
            keep_going = (s != redecoded)
            s = redecoded
        except (UnicodeDecodeError, UnicodeEncodeError):
            # Only codec failures end the loop; a bare `except:` here would
            # also hide real bugs (e.g. AttributeError on a wrong type).
            keep_going = False
    return redecoded


class HTMLStripper(HTMLParser):
    """Remove tags while keeping HTML entities intact for later unescaping."""

    def __init__(self):
        # convert_charrefs=False so entity/char references reach our
        # handlers verbatim; they are all unescaped in one pass later.
        super().__init__(convert_charrefs=False)
        self.fed = []  # accumulated text fragments

    def handle_starttag(self, tag, attrs):
        # Design decision: all/any tag is a word-splitter and thus
        # is converted to a space.
        self.fed.append(' ')

    def handle_data(self, d):
        self.fed.append(d)

    def handle_charref(self, number):
        # Re-emit numeric character references untouched.
        self.fed.append('&#%s;' % number)

    def handle_entityref(self, name):
        # Re-emit named entities untouched.
        self.fed.append('&%s;' % name)

    def get_data(self):
        """Return everything fed so far as a single string."""
        return ''.join(self.fed)


def isPlainASCIIAlphaNum(c):
    """Return True iff `c` is a lowercase ASCII letter or an ASCII digit."""
    o = ord(c)
    return (ORD_A_MIN <= o <= ORD_Z_MIN) or (ORD_0 <= o <= ORD_9)


def strip_html_and_convert_entities(html):
    """Strip HTML tags from `html`, converting entities to characters.

    Tags become spaces (word splitters); named and numeric entities are
    preserved during parsing and unescaped in a single pass at the end.
    """
    # http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
    # http://stackoverflow.com/questions/2087370/decode-html-entities-in-python-string
    parser = HTMLStripper()
    parser.feed(html)
    # The HTML parser breaks if parsing ends/EOF on a single-letter broken
    # entity such as 'at&t'. Adding an extra space fixes this.
    parser.feed(' ')
    parser.close()
    return unescape(parser.get_data())


def normalize_case(s):
    """Lowercase `s`."""
    return s.lower()


def normalize_diacritics(input_str):
    """Remove diacritics (accents) via NFKD decomposition."""
    # References:
    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # http://stackoverflow.com/questions/9042515/normalizing-unicode-text-to-filenames-etc-in-python
    nkfd_form = unicodedata.normalize('NFKD', str(input_str))
    return ''.join(c for c in nkfd_form if not unicodedata.combining(c))


def normalize_to_plain_ascii(s):
    """Replace every non-ASCII character in `s` with '?'."""
    return s.encode('ascii', 'replace').decode('ascii')


def normalize_to_alphanum_and_spaces(s):
    """Replace every char that is not a lowercase letter or digit by space."""
    return ''.join(c if isPlainASCIIAlphaNum(c) else ' ' for c in s)


def normalize_prepositions(s):
    """Replace common (pt/en) prepositions by a space."""
    prepositions = ['e', 'and', 'de', 'do', 'da']
    for prep in prepositions:
        pattern = r'\b' + prep + r'\b'
        s = re.sub(pattern, ' ', s)
    return s


def normalize_whitespace(s):
    """Collapse whitespace runs into single spaces and strip the ends."""
    s = re.sub(r'\s+', ' ', s)
    return s.strip()


def normalize_string(s, fix_utf8=False):
    """Aggressively normalize `s` down to lowercase ASCII alphanumerics.

    Args:
        s: the string to normalize (may contain HTML markup and entities).
        fix_utf8: if True, first attempt mojibake repair via
            try_redecode_utf8.
    Returns:
        A plain-ASCII, lowercase, single-spaced string.
    """
    if fix_utf8:
        s = try_redecode_utf8(s)
    s = strip_html_and_convert_entities(s)
    s = normalize_case(s)
    s = normalize_diacritics(s)
    s = normalize_to_plain_ascii(s)
    s = normalize_to_alphanum_and_spaces(s)
    # s = normalize_prepositions(s)
    s = normalize_whitespace(s)
    # We don't need to re-normalize to a known unicode form (say, NFC) since
    # we only have plain ASCII alphanumeric data at this point: there is no
    # "combined" nor "decomposed" unicode content left in `s`.
    return s


def main():
    sample = [
        'Cine e foto',
        'Cine & foto',
        'Cine&Foto',  # BeautifulSoup used to break for this one.
        'Cine+foto',
        'Cíñe_e.foto',
        '<a>Çine e<br>Foto',
        'Cine\u65e5\u672c\u8a9eFoto',
        'Carrinhos e Veículos',
        '<a href="#">Cine <em>(é fóto¬ \u0394ημώ)</em></a>',
        'Soul e R&B',  # we used to break on this one.
        'T\xc3\xaanis e Esporte',
    ]
    from collections import defaultdict
    categories = defaultdict(list)
    for i in sample:
        n = normalize_string(i, fix_utf8=True)
        categories[n].append(i)
    for k, v in categories.items():
        print(k, v)
    return categories


if __name__ == '__main__':
    main()
tmacam revised this gist
Apr 19, 2013: 1 changed file with 2 additions and 0 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -65,6 +65,7 @@ def strip_html_and_convert_entities(html): # http://stackoverflow.com/questions/2087370/decode-html-entities-in-python-string parser = HTMLStripper() parser.feed(html) parser.close() return parser.unescape(parser.get_data()) @@ -135,6 +136,7 @@ def normalize_string(s): def main(): sample = [u"Cine e foto", u"Cine & foto", u"Cine&Foto", # BeautifulSoup breaks for this one. u"Cine+foto", u"Cíñe_e.foto", u"<a>Çine e<br>Foto", -
tmacam created this gist
Apr 19, 2013. There are no files selected for viewing.
#!/usr/bin/python
# vim:ts=4:sts=4:sw=4:et:wrap:ai:fileencoding=utf-8:
"""A collection of string normalization routines.

normalize_string performs an aggressive (but arguably sound) string
normalization process: HTML stripping, entity decoding, lowercasing,
diacritic removal, and reduction to ASCII alphanumerics and spaces.
"""

from html import unescape
from html.parser import HTMLParser
import re
import unicodedata

# Codepoint boundaries for the plain-ASCII alphanumeric test below.
# Only lowercase letters are accepted because normalization lowercases first.
ORD_A_MIN = ord('a')
ORD_Z_MIN = ord('z')
ORD_0 = ord('0')
ORD_9 = ord('9')


class HTMLStripper(HTMLParser):
    """Remove tags while keeping HTML entities intact for later unescaping."""

    def __init__(self):
        # convert_charrefs=False so entity/char references reach our
        # handlers verbatim; they are all unescaped in one pass later.
        super().__init__(convert_charrefs=False)
        self.fed = []  # accumulated text fragments

    def handle_starttag(self, tag, attrs):
        # Design decision: all/any tag is a word-splitter and thus
        # is converted to a space.
        self.fed.append(' ')

    def handle_data(self, d):
        self.fed.append(d)

    def handle_charref(self, number):
        # Re-emit numeric character references untouched.
        self.fed.append('&#%s;' % number)

    def handle_entityref(self, name):
        # Re-emit named entities untouched.
        self.fed.append('&%s;' % name)

    def get_data(self):
        """Return everything fed so far as a single string."""
        return ''.join(self.fed)


def isPlainASCIIAlphaNum(c):
    """Return True iff `c` is a lowercase ASCII letter or an ASCII digit."""
    o = ord(c)
    return (ORD_A_MIN <= o <= ORD_Z_MIN) or (ORD_0 <= o <= ORD_9)


def strip_html_and_convert_entities(html):
    """Strip HTML tags from `html`, converting entities to characters.

    Previously this used BeautifulSoup, but that did not handle numeric
    character entities correctly. The HTMLParser-based approach has no
    third-party dependency and handles them fine. :-)
    """
    # http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
    # http://stackoverflow.com/questions/2087370/decode-html-entities-in-python-string
    parser = HTMLStripper()
    parser.feed(html)
    # The HTML parser breaks if parsing ends/EOF on a single-letter broken
    # entity such as 'at&t'. Feeding an extra space, then closing the
    # parser to flush buffered data, fixes this.
    parser.feed(' ')
    parser.close()
    return unescape(parser.get_data())


def normalize_case(s):
    """Lowercase `s`."""
    return s.lower()


def normalize_diacritics(input_str):
    """Remove diacritics (accents) via NFKD decomposition."""
    # References:
    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # http://stackoverflow.com/questions/9042515/normalizing-unicode-text-to-filenames-etc-in-python
    nkfd_form = unicodedata.normalize('NFKD', str(input_str))
    return ''.join(c for c in nkfd_form if not unicodedata.combining(c))


def normalize_to_plain_ascii(s):
    """Replace every non-ASCII character in `s` with '?'."""
    return s.encode('ascii', 'replace').decode('ascii')


def normalize_to_alphanum_and_spaces(s):
    """Replace every char that is not a lowercase letter or digit by space."""
    return ''.join(c if isPlainASCIIAlphaNum(c) else ' ' for c in s)


def normalize_prepositions(s):
    """Replace common (pt/en) prepositions by a space."""
    prepositions = ['e', 'and', 'de', 'do', 'da']
    for prep in prepositions:
        pattern = r'\b' + prep + r'\b'
        s = re.sub(pattern, ' ', s)
    return s


def normalize_whitespace(s):
    """Collapse whitespace runs into single spaces and strip the ends."""
    s = re.sub(r'\s+', ' ', s)
    return s.strip()


def normalize_string(s):
    """Aggressively normalize `s` down to lowercase ASCII alphanumerics.

    Args:
        s: the string to normalize (may contain HTML markup and entities).
    Returns:
        A plain-ASCII, lowercase, single-spaced string.
    """
    s = strip_html_and_convert_entities(s)
    s = normalize_case(s)
    s = normalize_diacritics(s)
    s = normalize_to_plain_ascii(s)
    s = normalize_to_alphanum_and_spaces(s)
    # s = normalize_prepositions(s)
    s = normalize_whitespace(s)
    # We don't need to re-normalize to a known unicode form (say, NFC) since
    # we only have plain ASCII alphanumeric data at this point: there is no
    # "combined" nor "decomposed" unicode content left in `s`.
    return s


def main():
    sample = [
        'Cine e foto',
        'Cine & foto',
        'Cine+foto',
        'Cíñe_e.foto',
        '<a>Çine e<br>Foto',
        'Cine\u65e5\u672c\u8a9eFoto',
        'Carrinhos e Veículos',
        '<a href="#">Cine <em>(é fóto¬ \u0394ημώ)</em></a>',
    ]
    from collections import defaultdict
    categories = defaultdict(list)
    for i in sample:
        n = normalize_string(i)
        categories[n].append(i)
    for k, v in categories.items():
        print(k, v)
    return categories


if __name__ == '__main__':
    main()