-
-
Save ktmud/4381093 to your computer and use it in GitHub Desktop.
Revisions
-
ktmud revised this gist
Dec 26, 2012 . 1 changed file with 13 additions and 7 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,4 +1,8 @@ # -*- encoding:utf-8 """ (c) https://gist.github.com/577116 HTML5 microdata parser for python 2.x/3.x - it requires lxml - microdata specification: http://dev.w3.org/html5/md/ @@ -26,7 +30,6 @@ def __init__(self, doc, uri=""): self.url = urljoin self.datetime = lambda dt: dt self.text = lambda t: t def items(self, types=None): ret = [] @@ -61,7 +64,9 @@ def parse_item_elem(self, elem, item): self.parse_item_props(child, props, None) pass #item["properties"] = props for k,v in props.items(): item[k] = v attrs = elem.keys() if "itemid" in attrs: item["id"] = elem.get("itemid") if "itemtype" in attrs: item["type"] = elem.get("itemtype") @@ -92,9 +97,10 @@ def parse_item_props(self, elem, props, ref): pass pass if elem.get("itemscope") is None: for child in elem.getchildren(): self.parse_item_props(child, props, ref) pass return def parse_value(self, elem, ref, names): @@ -201,4 +207,4 @@ def find_base(self): </html>""" ls = items(html) pprint(ls) pass -
ktmud revised this gist
Dec 26, 2012 . 1 changed file with 1 addition and 1 deletion. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -137,7 +137,7 @@ def store_cache(self, ref, names, value): def to_text(self, elem): ret = elem.text or "" for child in elem.getchildren(): ret += self.to_text(child) ret += child.tail or "" pass return ret -
bellbind revised this gist
Sep 13, 2010 . 1 changed file with 1 addition and 1 deletion. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -4,7 +4,7 @@ - microdata specification: http://dev.w3.org/html5/md/ """ try: from urllib.parse import urljoin except: from urlparse import urljoin import lxml.html as lhtml def items(html, types=None, uri=""): -
bellbind revised this gist
Sep 13, 2010 . 1 changed file with 4 additions and 3 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -12,12 +12,13 @@ def items(html, types=None, uri=""): returns [{"properties": {name: [val1, ...], ...}, "id": id, "type": type}, ...] """ doc = lhtml.fromstring(html) return Microdata(doc, uri).items(types) class Microdata(object): def __init__(self, doc, uri=""): self.base = uri self.doc = doc self.find_base() self.cache = {} -
bellbind revised this gist
Sep 13, 2010 . 1 changed file with 0 additions and 1 deletion. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -103,7 +103,6 @@ def parse_value(self, elem, ref, names): return self.parse_item_elem(elem, item) # from http://dev.w3.org/html5/md/#values tag = elem.tag if tag == "meta": value = self.text(elem.get("content")) elif tag in self.src_tags: -
bellbind revised this gist
Sep 13, 2010 . 1 changed file with 2 additions and 2 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -185,13 +185,13 @@ def find_base(self): rec ref </div> <div itemprop="friend" itemscope itemref="saburo"> ref </div> </div> <div id="saburo"> <div itemprop="name">Saburo</div> <div itemprop="friend" itemscope itemref="saburo"> self rec ref </div> <div itemprop="friend" itemscope itemref="shiro"> no ref -
bellbind revised this gist
Sep 13, 2010 . 1 changed file with 2 additions and 2 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -190,11 +190,11 @@ def find_base(self): </div> <div id="saburo"> <div itemprop="name">Saburo</div> <div itemprop="friend" itemscope itemref="saburo"> rec ref </div> <div itemprop="friend" itemscope itemref="shiro"> no ref </div> </div> </body> -
bellbind created this gist
Sep 13, 2010 . There are no files selected for viewing.
"""HTML5 microdata parser for python 2.x/3.x

- it requires lxml
- microdata specification: http://dev.w3.org/html5/md/
"""
try:
    from urllib.parse import urljoin
except ImportError:  # Python 2.x keeps urljoin in the urlparse module
    from urlparse import urljoin

import lxml.html as lhtml


def items(html, types=None, uri=""):
    """List the microdata of an HTML string as standard data types.

    html: HTML source text to parse.
    types: optional collection of ``itemtype`` values; when given (and not
        empty) only top-level items whose ``itemtype`` is in it are returned.
    uri: base URI used to resolve URL-valued properties.

    returns [{"properties": {name: [val1, ...], ...},
              "id": id, "type": type}, ...]
    ("id" and "type" are present only when the item element carries the
    ``itemid`` / ``itemtype`` attribute.)
    """
    return Microdata(html, uri).items(types)


class Microdata(object):
    """Microdata extractor bound to one parsed HTML document.

    The ``url``, ``datetime`` and ``text`` attributes are per-instance
    callables ("data factory") applied to raw property values; reassign
    them on an instance to change how values are converted.
    """

    # Elements whose property value is read from the ``src`` attribute.
    src_tags = ["audio", "embed", "iframe", "img", "source", "video"]
    # Elements whose property value is read from the ``href`` attribute.
    href_tags = ["a", "area", "link"]

    def __init__(self, html, uri=""):
        """Parse ``html`` and remember ``uri`` as the initial base URI."""
        self.base = uri
        self.doc = lhtml.fromstring(html)
        self.find_base()
        # itemref id -> {property name: [values, ...]}
        self.cache = {}
        # data factory
        self.url = urljoin
        self.datetime = lambda dt: dt
        self.text = lambda t: t

    def items(self, types=None):
        """Return the parsed top-level items of the document as a list."""
        return [self.parse_item_elem(elem, {})
                for elem in self.item_elems(self.doc, types)]

    def item_elems(self, elem, types=None):
        "iterate top-level items of elements"
        is_top_level = (elem.get("itemscope") is not None and
                        elem.get("itemprop") is None)
        if is_top_level and (not types or elem.get("itemtype") in types):
            yield elem
        for child in elem:
            # explicit loop instead of ``yield from`` to stay 2.x compatible
            for found in self.item_elems(child, types):
                yield found

    def parse_item_elem(self, elem, item):
        """Fill ``item`` (a dict) from the item element ``elem``; return it.

        Properties come from the elements named in ``itemref`` first, then
        from the descendants of ``elem``.
        """
        props = {}
        refs = elem.get("itemref")
        if refs is not None:
            for ref in refs.split():
                self.parse_item_ref(props, ref)
        for child in elem:
            self.parse_item_props(child, props, None)
        item["properties"] = props
        attrs = elem.keys()
        if "itemid" in attrs:
            item["id"] = elem.get("itemid")
        if "itemtype" in attrs:
            item["type"] = elem.get("itemtype")
        return item

    def parse_item_ref(self, props, ref):
        """Merge the properties found under the element with id ``ref``."""
        if ref not in self.cache:
            # Register the entry before crawling so that cyclic itemref
            # chains hit the cache instead of recursing forever.
            # NOTE: while a cycle is being resolved, an inner item may only
            # see the values cached so far for the outer reference.
            self.cache[ref] = {}
            target = self.elem_by_id(self.doc, ref)
            self.parse_item_props(target, {}, ref)
        for name, values in self.cache[ref].items():
            props.setdefault(name, []).extend(values)

    def parse_item_props(self, elem, props, ref):
        """Collect the properties declared on ``elem`` and its descendants.

        ``props`` is updated in place. ``ref`` is the itemref id being
        crawled (values are then also stored in the cache) or None.
        """
        if elem is None:
            return
        propnames = elem.get("itemprop")
        if propnames:
            names = propnames.split()
            value = self.parse_value(elem, ref, names)
            for propname in names:
                props.setdefault(propname, []).append(value)
        # An element with ``itemscope`` starts a new item: its descendants
        # belong to that item and must not leak into the enclosing one.
        if elem.get("itemscope") is None:
            for child in elem:
                self.parse_item_props(child, props, ref)

    def parse_value(self, elem, ref, names):
        """Return the property value of ``elem`` (nested item dict or scalar).

        The value is also recorded in the itemref cache under every name in
        ``names`` when ``ref`` is set.
        """
        if elem.get("itemscope") is not None:
            item = {}
            # cache the (still empty) dict first; it is filled in place
            self.store_cache(ref, names, item)
            return self.parse_item_elem(elem, item)
        # from http://dev.w3.org/html5/md/#values
        tag = elem.tag
        if tag == "meta":
            value = self.text(elem.get("content"))
        elif tag in self.src_tags:
            value = self.url(self.base, elem.get("src"))
        elif tag in self.href_tags:
            value = self.url(self.base, elem.get("href"))
        elif tag == "object":
            value = self.url(self.base, elem.get("data"))
        elif tag == "time" and "datetime" in elem.keys():
            value = self.datetime(elem.get("datetime"))
        else:
            value = self.text(self.to_text(elem))
        self.store_cache(ref, names, value)
        return value

    def store_cache(self, ref, names, value):
        """Append ``value`` to the cache of ``ref`` for each name; return it.

        Does nothing when ``ref`` or ``names`` is empty/None.
        """
        if ref and names:
            cached = self.cache[ref]
            for name in names:
                cached.setdefault(name, []).append(value)
        return value

    def to_text(self, elem):
        """Return the concatenated text content of ``elem``'s subtree."""
        parts = [elem.text or ""]
        for child in elem:
            # must go through self: a bare ``to_text`` is not defined at
            # module level and would raise NameError
            parts.append(self.to_text(child))
            parts.append(child.tail or "")
        return "".join(parts)

    def elem_by_id(self, elem, id):
        """Depth-first search under ``elem`` for the element with that id.

        Returns the first matching element, or None when there is none.
        """
        if elem.get("id") == id:
            return elem
        for child in elem:
            found = self.elem_by_id(child, id)
            if found is not None:
                return found
        return None

    def find_base(self):
        """Update ``self.base`` from the first ``<base href>`` in ``<head>``.

        Only applies when the parsed root element is ``<html>``.
        """
        if self.doc.tag != "html":
            return
        for head in self.doc:
            if head.tag != "head":
                continue
            for base in head:
                if base.tag != "base":
                    continue
                uri = base.get("href")
                if uri is not None:
                    self.base = urljoin(self.base, uri)
                    return


if __name__ == "__main__":
    from pprint import pprint
    html = """
<html>
  <body>
    <div itemscope itemref="taro" itemid="">
    </div>
    <div id="taro">
      <span itemprop="name">Taro</span>
      <div itemprop="age">18</div>
      <div itemprop="friend" itemscope itemref="jiro">
        ref
      </div>
    </div>
    <div id="jiro">
      <div itemprop="name">Jiro</div>
      <div itemprop="friend" itemscope itemref="taro">
        rec ref
      </div>
      <div itemprop="friend" itemscope itemref="saburo">
        rec ref
      </div>
    </div>
    <div id="saburo">
      <div itemprop="name">Saburo</div>
      <div itemprop="friend" itemscope itemref="jiro">
        rec ref
      </div>
      <div itemprop="friend" itemscope itemref="shiro">
        rec ref
      </div>
    </div>
  </body>
</html>"""
    ls = items(html)
    pprint(ls)