Created
June 20, 2020 21:56
-
-
Save bobby569/4baed0eeb8f6f667fab6c01d79650656 to your computer and use it in GitHub Desktop.
Extended usage of urllib with the assistance of BeautifulSoup.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import bs4 | |
| import re | |
| import urllib.request | |
def getFullHtml(url: str) -> str:
    """Retrieve the html content of the given url.

    The response is closed as soon as its body has been read. The body is
    decoded with the charset declared in the Content-Type header, falling
    back to UTF-8 when none is declared or the declared name is unknown.

    Args:
        url: Address to fetch; anything `urllib.request.urlopen` accepts.

    Returns:
        The decoded response body.

    Raises:
        urllib.error.URLError: If the url cannot be opened.
        UnicodeDecodeError: If the body is not valid in the chosen charset.
    """
    # `with` guarantees the underlying connection is released even on error.
    with urllib.request.urlopen(url) as response:
        charset = response.headers.get_content_charset() or 'utf-8'
        raw = response.read()
    try:
        return raw.decode(charset)
    except LookupError:
        # The server advertised a codec Python does not know about.
        return raw.decode('utf-8')
def getTags(url: str, tag: str) -> list:
    """Retrieve every element named `tag` from the page at `url`.

    Args:
        url: Address of the page to download and parse.
        tag: Name of the html tag to collect, e.g. ``'a'``.

    Returns:
        The matching elements as produced by BeautifulSoup's ``find_all``
        (tag objects, not plain strings), parsed with ``'html.parser'``.
    """
    html = getFullHtml(url)
    soup = bs4.BeautifulSoup(html, 'html.parser')
    # Calling the soup object directly is shorthand for find_all; spell it out.
    return soup.find_all(tag)
def getContent(tagArr: list, attr: str) -> list:
    """Retrieve the attribute value for each tag in `tagArr`.

    Args:
        tagArr: Objects exposing a ``get(name)`` method, one per tag.
        attr: Name of the attribute to read from every tag.

    Returns:
        The truthy values of `attr` in input order; tags where the
        attribute is missing or empty are skipped.
    """
    values = (tag.get(attr) for tag in tagArr)
    return [value for value in values if value]
def getLinks(start_url: str, depth: int = 3) -> set:
    """Get all links encountered within up to `depth` hops, in no particular order.

    Performs a breadth-first walk: each round fetches every url discovered in
    the previous round and collects the absolute http(s) links found in its
    ``<a href>`` attributes.

    Args:
        start_url: Page the walk starts from; always part of the result.
        depth: Number of rounds to perform. With 0 only `start_url` is
            returned.

    Returns:
        The set of `start_url` plus every link discovered during the walk.
    """
    def getEmbedUrl(url: str) -> set:
        """Return the absolute http(s) links on `url`; empty set on failure."""
        try:
            tags = getTags(url, 'a')
            content = getContent(tags, 'href')
            return {val for val in content if re.match(r'https?://', val)}
        except Exception:
            # Best-effort crawl: an unreachable or unparsable page yields no
            # links. `Exception` (rather than a bare except) keeps Ctrl-C and
            # SystemExit working during a long crawl.
            return set()

    res = {start_url}
    queue = {start_url}
    for _ in range(depth):
        if not queue:
            # Nothing new was found last round, so further rounds are no-ops.
            break
        tmp = set()
        for url in queue:
            tmp |= getEmbedUrl(url) - res
        res |= tmp
        queue = tmp
    return res
def getFile(url: str, extd: str) -> list:
    """Collect the href targets on `url` that sit on lines mentioning `.extd`.

    The page is scanned line by line. For every line containing the text
    ``.<extd>``, the first double-quoted value following the first ``href``
    on that line is taken. Targets starting with ``./`` are resolved against
    `url`; all others are returned as written.

    Args:
        url: Page to download; also the base for ``./`` relative targets.
        extd: File extension to look for, without the leading dot.

    Returns:
        The collected targets in page order. Lines that mention the
        extension but carry no ``href="..."`` are skipped.
    """
    html = getFullHtml(url)
    suffix = '.{}'.format(extd)
    # Avoid a doubled slash when the page url already ends with '/'.
    base = url.rstrip('/')
    res = []
    for line in html.split('\n'):
        if suffix not in line:
            continue
        href = line.find('href')
        if href == -1:
            # Extension appears in plain text or another attribute.
            continue
        start = line.find('"', href) + 1
        if start == 0:
            # No opening double quote after href (find returned -1).
            continue
        end = line.find('"', start)
        if end == -1:
            # Value is not closed on this line.
            continue
        filename = line[start:end]
        if filename.startswith('./'):
            res.append(base + filename[1:])
        else:
            res.append(filename)
    return res
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment