#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Scrape the paginated table served at AXE_URL and print it as a JSON list.

The first page is fetched from AXE_URL itself; every following page is
requested through the same ``?page=next`` URL.  A cookie jar is installed on
the global opener so cookies set by the server are sent back on later
requests (presumably this is how the server tracks the current page --
TODO confirm against the site).

The JSON text is streamed to stdout while the HTML is parsed; nothing is
accumulated in memory.
"""
import contextlib
import cookielib
import sys
import urllib2
from HTMLParser import HTMLParser

AXE_URL = 'http://axe-level-1.herokuapp.com/lv3'

# Number of "?page=next" requests issued after the initial page.
NEXT_PAGE_COUNT = 75

# Install a cookie-aware opener globally so every urlopen() call below
# shares the same cookie jar.
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)


def get_page_raw_data(axe_url='%s/?page=' % AXE_URL, page_num='next'):
    """Fetch ``axe_url + page_num`` and return the raw response body.

    With the defaults this requests ``<AXE_URL>/?page=next``.

    Raises whatever ``urllib2.urlopen`` raises (e.g. ``urllib2.URLError``).
    """
    url = axe_url + page_num
    # closing() guarantees the HTTP response is released even if read()
    # raises; the original never closed it.
    with contextlib.closing(urllib2.urlopen(url)) as response:
        return response.read()


def write(value=''):
    """Write ``str(value)`` to stdout without a trailing newline."""
    sys.stdout.write(str(value))


def _escape_json(text):
    """Escape backslashes and double quotes so *text* fits in a JSON string."""
    # Backslash must be handled first, otherwise the backslashes added for
    # the quotes would themselves be doubled.
    return text.replace('\\', '\\\\').replace('"', '\\"')


class MyHTMLParser2(HTMLParser):
    """Stream table rows to stdout as JSON objects while parsing.

    ``row`` counts the ``</tr>`` tags seen in the current table and is reset
    to 0 at ``</table>``.  While it is 0 nothing is emitted, which skips the
    first (header) row.  ``col`` is the index of the current cell in a row
    and selects the key prefix from ``fm``.
    """

    row, col = 0, 0
    # Key prefixes for the three cells of a row, in column order.  The
    # spacing (including '"name" : ') is part of the emitted output.
    fm = ['"town": "', '"village": "', '"name" : "']

    def handle_starttag(self, tag, attrs):
        """Open a JSON object on ``<tr>`` and a key/value pair on ``<td>``."""
        if not self.row:
            return
        if tag == 'tr':
            # Rows after the first data row of a page need a separator.
            if self.row > 1:
                write(', ')
            write('{')
        if tag == 'td':
            write(self.fm[self.col])

    def handle_endtag(self, tag):
        """Close the current value/object and advance the row/col counters."""
        if tag == 'table':
            # End of the table: stop emitting until the next header row
            # has been skipped again.
            self.row = 0
        if self.row:
            if tag == 'td':
                # The last cell (index 2) closes the string only; earlier
                # cells also emit the separator before the next key.
                if self.col == 2:
                    write('"')
                else:
                    write('", ')
                self.col = self.col + 1
            if tag == 'tr':
                write('}')
                self.col = 0
        # Counting happens after the emit logic above, so the header row
        # (seen while row == 0) is skipped.
        if tag == 'tr':
            self.row = self.row + 1

    def handle_data(self, data):
        """Emit stripped, JSON-escaped text while data rows are active."""
        if self.row:
            write(_escape_json(data.strip()))


parser = MyHTMLParser2()
write('[')

# First page: plain AXE_URL with no query string.
html = get_page_raw_data(axe_url=AXE_URL, page_num='')
parser.feed(html)
write(', ')

# Remaining pages: each request uses the default "?page=next" URL.
for i in range(1, NEXT_PAGE_COUNT + 1):
    html = get_page_raw_data()
    parser.feed(html)
    # Separate pages with a comma, except after the last one.
    if i < NEXT_PAGE_COUNT:
        write(', ')

write(']')
# Example output:
# [{"town": "東區", "village": "東勢里", "name" : "林錦全"}, ...]