Skip to content

Instantly share code, notes, and snippets.

@ellipse42-archived
Last active August 29, 2015 14:21
Show Gist options
  • Select an option

  • Save ellipse42-archived/cbb3b1a2dedd420a5fda to your computer and use it in GitHub Desktop.

Select an option

Save ellipse42-archived/cbb3b1a2dedd420a5fda to your computer and use it in GitHub Desktop.
# coding=utf-8
from __future__ import unicode_literals
import re
import sys
import time
import traceback
from datetime import datetime
from Queue import Queue
from multiprocessing.dummy import Pool as ThreadPool
import pymongo
import requests
from bs4 import BeautifulSoup
# Python 2 hack: reload sys to re-expose setdefaultencoding, then force the
# default codec to UTF-8 so implicit str<->unicode conversions of scraped
# Chinese text do not raise UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf-8')
# Collections in the local MongoDB "qyer" database (default host/port).
country_db = pymongo.MongoClient().qyer.country
poi_db = pymongo.MongoClient().qyer.poi
# Global work list of paginated poi-list URLs, filled by get_tasks() and
# consumed by parse() via the thread pools in __main__.
tasks = []
def get_countries():
    """Scrape the qyer.com place index, upsert every country into MongoDB
    and return the list of country ids.

    Returns:
        list of country-id strings (the trailing path segment of each
        country's URL), used later as crawl keys.
    """
    result = []
    url = 'http://place.qyer.com/'
    # Fix: use the shared retrying fetcher instead of a bare requests.get(),
    # so a transient network failure here does not abort the whole crawl
    # (consistent with get_tasks() and parse()).
    soup = BeautifulSoup(_http_get(url))
    continents = soup.select(
        'body > div.pla_indallworld > div.pla_indcountrylists > div')
    if continents:
        for continent in continents:
            # Continent id is the second-to-last path segment of its link.
            continent_id = continent.select(
                'h2 > em > a')[0]['href'].split('/')[-2]
            # Country links appear both directly under the list and nested
            # inside <p> elements, so collect both selector variants.
            countries = continent.select('div > ul > li > a')
            countries += continent.select('div > ul > li > p > a')
            for country in countries:
                country_id = country['href'].split('/')[-2]
                # First child is the Chinese name text, second the English.
                zh_name = country.contents[0].strip()
                en_name = country.contents[1].string.strip()
                country_db.update({'_id': country_id}, {
                    'zh_name': zh_name, 'en_name': en_name,
                    'continent': continent_id}, upsert=True)
                result.append(country_id)
    return result
def _http_get(url):
    """Fetch *url* and return the response body, retrying until success.

    Retries forever with a 1-second pause on any network error or any
    non-200 status, so callers always receive a 200 response body.

    Returns:
        str: the raw response content.
    """
    while 1:
        try:
            res = requests.get(url)
            if res.status_code == 200:
                break
        except Exception:
            # Fix: narrowed from a bare `except:` so KeyboardInterrupt /
            # SystemExit can still stop the crawl; network errors are
            # logged and retried.
            traceback.print_exc()
        time.sleep(1)
    return res.content
def get_tasks(country):
    """Collect paginated poi-list URLs for *country* into the global ``tasks``.

    Reads the country's 'alltravel' page; for every poi category tab
    (skipping the aggregate 'alltravel' tab itself) it parses the total
    poi count from the tab label and appends one '?page=N' URL per
    15-item page to the module-level ``tasks`` list.
    """
    url = 'http://place.qyer.com/%s/alltravel/' % country
    soup = BeautifulSoup(_http_get(url))
    links = soup.select('#poiSort > a')
    if links:
        for link in links:
            try:
                if link['href'].split('/')[-2] != 'alltravel':
                    # The tab label embeds the total poi count as digits.
                    # Fix: raw string for the regex pattern.
                    nums = int(re.search(r'\d+', link.string).group())
                    if nums > 0:
                        # 15 pois per page; (nums - 1) // 15 matches the
                        # original Python 2 integer division.
                        tasks.extend(
                            link['href'] + '?page=%d' % (page + 1)
                            for page in range((nums - 1) // 15))
            except Exception:
                # Fix: narrowed from a bare `except:`; a malformed tab is
                # logged and skipped rather than silently swallowing
                # SystemExit/KeyboardInterrupt. (Also removed the unused
                # local `result` the original declared but never returned.)
                traceback.print_exc()
def parse(url):
    """Parse one poi-list page and upsert every poi found into MongoDB.

    *url* looks like ``.../<country>/<category>/?page=N``; category and
    country are recovered from the URL path segments. Rows without a
    parseable poi id are skipped.
    """
    soup = BeautifulSoup(_http_get(url))
    pois = soup.select('#poiLists > li')
    if not pois:
        return
    # Hoisted out of the loop: category/country are invariant per page.
    parts = url.split('/')
    category, country = parts[-2], parts[-3]
    for poi in pois:
        img_url, poi_id, zh_name, en_name = '', '', '', ''
        try:
            img = poi.select('div > p.pics > a > img')
            if img:
                img_url = img[0]['src']
        except Exception:
            # Image is optional; keep the empty-string default.
            pass
        try:
            ci = poi.select('div > h3 > a')
            if ci:
                poi_id = ci[0]['href'].split('/')[-2]
                # First child is the Chinese name, second the English name.
                zh_name = ci[0].contents[0].strip()
                en_name = ci[0].contents[1].string.strip()
        except Exception:
            # Malformed entry: poi_id stays '' and the row is skipped below.
            # Fix: narrowed from bare `except:` in both handlers.
            pass
        if poi_id:
            poi_db.update({'poi': poi_id},
                          {'poi': poi_id, 'zh_name': zh_name,
                           'en_name': en_name, 'category': category,
                           'country': country, 'img': img_url},
                          upsert=True)
if __name__ == '__main__':
    # Stage 1: discover all countries (also upserts them into MongoDB).
    countries = get_countries()
    # Stage 2: fan out over countries with 50 worker threads; get_tasks()
    # appends paginated poi-list URLs to the global `tasks` list.
    pool = ThreadPool(50)
    pool.map(get_tasks, countries)
    pool.close()
    pool.join()
    # Progress hint: total number of pages queued for scraping.
    print len(tasks)
    # Stage 3: fan out again to fetch/parse every page and upsert pois.
    pool = ThreadPool(50)
    pool.map(parse, tasks)
    pool.close()
    pool.join()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment