#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Scrape the paginated table at AXE_URL and print its rows as a JSON array.

Each data row of every page becomes one object with the keys ``town``,
``village`` and ``name``; the objects are streamed to stdout between ``[``
and ``]`` as the pages are parsed.
"""
import json
import random
import sys
import time

try:  # Python 2 module names, as the script was originally written
    import urllib2
    import cookielib
    from HTMLParser import HTMLParser
except ImportError:  # Python 3: the same APIs under their renamed modules
    import urllib.request as urllib2
    import http.cookiejar as cookielib
    from html.parser import HTMLParser

AXE_URL = 'http://axe-level-4.herokuapp.com/lv4'
UA = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'
# Number of the last page requested by main().
LAST_PAGE = 24


def REF(num):
    """Return the URL of page ``num`` of the listing."""
    return '%s/?page=%s' % (AXE_URL, num)


# Every urlopen() call goes through an opener that keeps cookies between
# requests.
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)


def get_page_raw_data(ref=None, axeurl='%s/?page=1' % AXE_URL):
    """Fetch ``axeurl`` and return the raw response body.

    ref    -- value for the ``Referer`` header; the header is omitted when
              this is empty or None.
    axeurl -- URL to request (defaults to page 1).

    The call sleeps for a random 0-4 seconds before sending the request.
    Errors raised by ``urllib2.urlopen`` are not caught here.
    """
    # Random pause between requests (random() * 4 seconds).
    time.sleep(random.random() * 2 ** 2)
    request = urllib2.Request(axeurl)
    request.add_header('User-Agent', UA)
    if ref:
        request.add_header('Referer', ref)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Release the connection even if read() fails.
        response.close()


def write(value=''):
    """Write ``str(value)`` to stdout without a trailing newline."""
    sys.stdout.write(str(value))


def json_escape(text):
    """Return ``text`` escaped for use inside a double-quoted JSON string.

    Quotes, backslashes and control characters are escaped; other
    characters are left as they are (``ensure_ascii=False``).
    """
    # dumps() yields a complete quoted literal; drop the surrounding quotes
    # because the caller writes them itself.
    return json.dumps(text, ensure_ascii=False)[1:-1]


class MyHTMLParser4(HTMLParser):
    """Stream the data rows of an HTML table to stdout as JSON objects.

    The first ``<tr>`` of each table is treated as a header and skipped.
    Every following row is written as
    ``{"town": "...", "village": "...", "name" : "..."}``, with ``', '``
    between consecutive rows of the same table. Separators between tables
    (pages) are the caller's responsibility.
    """

    # row: number of </tr> tags seen in the current table (0 while still in
    #      the header row or outside a table).
    # col: index of the cell being emitted within the current row.
    row, col = 0, 0
    # Key prefix written when the cell at each column index opens.
    fm = ['"town": "', '"village": "', '"name" : "']

    def handle_starttag(self, tag, attrs):
        """Open a JSON object on ``<tr>`` and a key/value pair on ``<td>``."""
        if not self.row:
            return
        if tag == 'tr':
            if self.row > 1:
                write(', ')
            write('{')
        if tag == 'td':
            write(self.fm[self.col])

    def handle_endtag(self, tag):
        """Close the current value/object and keep the row counter in sync."""
        if tag == 'table':
            # Re-arm the header skip for the next table fed to this parser.
            self.row = 0
        if self.row:
            if tag == 'td':
                # No comma after the third (last) column.
                if self.col == 2:
                    write('"')
                else:
                    write('", ')
                self.col = self.col + 1
            if tag == 'tr':
                write('}')
                self.col = 0
        # Counted outside the guard above so that the header's </tr> moves
        # row from 0 to 1 and thereby enables output for the rows after it.
        if tag == 'tr':
            self.row = self.row + 1

    def handle_data(self, data):
        """Write stripped, JSON-escaped text found inside a data row."""
        if self.row:
            # Escaping keeps the output valid JSON when a cell contains
            # quotes, backslashes or control characters.
            write(json_escape(data.strip()))


def _to_text(raw):
    """Return ``raw`` in the form ``HTMLParser.feed`` expects.

    ``str`` input is returned untouched (on Python 2 that is the raw
    response body, exactly as the script originally fed it). Anything else
    is decoded as UTF-8.
    NOTE(review): UTF-8 is an assumption about the server's encoding --
    confirm against the response headers.
    """
    if isinstance(raw, str):
        return raw
    return raw.decode('utf-8')


def main():
    """Fetch pages 1..LAST_PAGE in order and print all rows as one JSON array.

    Page 1 is requested as ``AXE_URL + '/'`` without a Referer; every later
    page sends the previous page's URL as its Referer.
    """
    parser = MyHTMLParser4()
    write('[')
    referer = None
    for page in range(1, LAST_PAGE + 1):
        if page == 1:
            url = AXE_URL + '/'
        else:
            url = REF(page)
            # Separates the last row of the previous page from the first
            # row of this one.
            write(', ')
        parser.feed(_to_text(get_page_raw_data(ref=referer, axeurl=url)))
        referer = url
    write(']')


if __name__ == '__main__':
    main()