Last active
January 4, 2018 01:27
-
-
Save alan-ho/513b4301fbe9bc57a32df83a2b79c666 to your computer and use it in GitHub Desktop.
Creating scrapers and using them
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from urllib.request import urlopen | |
| from bs4 import BeautifulSoup | |
| import datetime | |
| import random | |
| import re | |
# Seed the PRNG. Calling seed() with no argument uses OS entropy (or the
# current time) automatically; the original passed a datetime object, which
# relies on hashing arbitrary objects — deprecated since Python 3.9.
random.seed()
def getLinks(articleUrl):
    """Fetch a Wikipedia article and return its internal article links.

    articleUrl: the path portion of the URL, e.g. "/wiki/Kevin_Bacon".
    Returns a list of <a> tags found inside the #bodyContent div whose
    href is a /wiki/ path containing no colon (colons mark non-article
    namespaces such as "File:" or "Category:").
    """
    # https avoids the redirect Wikipedia issues for plain http requests.
    html = urlopen("https://en.wikipedia.org" + articleUrl)
    bsObj = BeautifulSoup(html, 'html.parser')
    # find_all is the current bs4 API name; findAll is a legacy alias.
    return bsObj.find("div", {"id": "bodyContent"}).find_all(
        "a", href=re.compile(r"^(/wiki/)((?!:).)*$"))
# Random walk through Wikipedia: start at Kevin Bacon, repeatedly follow
# one randomly chosen article link, printing each visited path. Stops only
# if a page with no article links is reached.
links = getLinks("/wiki/Kevin_Bacon")
while links:
    # random.choice is the idiomatic way to pick a random list element.
    newArticle = random.choice(links).attrs["href"]
    print(newArticle)
    links = getLinks(newArticle)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from urllib.request import urlopen | |
| from bs4 import BeautifulSoup | |
| import re | |
# Set of /wiki/ paths already discovered, so each page is reported and
# crawled at most once.
pages = set()
def getLinks(pageUrl):
    """Crawl Wikipedia from pageUrl, printing each /wiki/ link on first sight.

    pageUrl: path portion of the starting URL ("" for the site root).
    Visited paths are recorded in the module-level `pages` set so no page
    is fetched twice.

    The original recursed once per newly discovered page, which exhausts
    Python's recursion limit (~1000 frames) almost immediately on a site
    the size of Wikipedia. This version keeps an explicit stack instead,
    so it can run indefinitely without a RecursionError.
    """
    global pages
    toVisit = [pageUrl]
    while toVisit:
        url = toVisit.pop()
        # https avoids the redirect Wikipedia issues for plain http.
        html = urlopen("https://en.wikipedia.org" + url)
        bsObj = BeautifulSoup(html, 'html.parser')
        # find_all is the current bs4 API name; findAll is a legacy alias.
        for link in bsObj.find_all("a", href=re.compile(r"^(/wiki/)")):
            if 'href' in link.attrs:
                newPage = link.attrs['href']
                if newPage not in pages:
                    # First time we see this page: report it and queue it.
                    print(newPage)
                    pages.add(newPage)
                    toVisit.append(newPage)
# Guard the entry point so importing this module does not immediately
# launch an effectively endless crawl of Wikipedia.
if __name__ == "__main__":
    getLinks("")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment