Skip to content

Instantly share code, notes, and snippets.

@bobby569
Created June 20, 2020 21:56
Show Gist options
  • Select an option

  • Save bobby569/4baed0eeb8f6f667fab6c01d79650656 to your computer and use it in GitHub Desktop.

Select an option

Save bobby569/4baed0eeb8f6f667fab6c01d79650656 to your computer and use it in GitHub Desktop.
Extended usage of urllib with assist of BeautifulSoup.
import re
import urllib.request
from typing import List

import bs4
def getFullHtml(url: str) -> str:
    """Retrieve the html content of the given url.

    The response body is decoded as UTF-8.
    """
    # Use a context manager so the HTTP response is always closed, even if
    # read()/decode() raises (the original leaked the response object).
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')
def getTags(url: str, tag: str) -> List[str]:
    """Fetch `url` and return every occurrence of the given html tag.

    Calling a BeautifulSoup object like a function is shorthand for
    `find_all`, so this returns all matching tag elements.
    """
    markup = getFullHtml(url)
    return bs4.BeautifulSoup(markup, 'html.parser')(tag)
def getContent(tagArr: List[str], attr: str) -> List[str]:
    """Collect the value of attribute `attr` from each tag in `tagArr`.

    Tags where the attribute is absent or falsy (None, empty string)
    are dropped from the result.
    """
    # filter(None, ...) keeps exactly the truthy values, matching the
    # original's `if content:` guard.
    return list(filter(None, (element.get(attr) for element in tagArr)))
def getLinks(start_url: str, depth=3) -> set:
    """Breadth-first collect absolute http(s) links reachable from `start_url`.

    Follows links up to `depth` levels deep and returns the set of all
    URLs discovered (including `start_url` itself), in no particular order.
    """
    def getEmbedUrl(url: str) -> set:
        """Best-effort: return the absolute links on `url`, or an empty set."""
        try:
            tags = getTags(url, 'a')
            content = getContent(tags, 'href')
            return {val for val in content if re.match(r'https?://', val)}
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit still propagate; any fetch/parse error just
            # contributes no links.
            return set()

    res = {start_url}
    queue = {start_url}
    for _ in range(depth):
        tmp = set()
        for url in queue:
            # Only keep URLs we have not already seen.
            tmp |= getEmbedUrl(url) - res
        res |= tmp
        queue = tmp
        if not queue:
            # No new URLs discovered; further rounds would be no-ops.
            break
    return res
def getFile(url: str, extd: str) -> list:
    """Scrape `url` for href links pointing at files with extension `extd`.

    `extd` is given without the leading dot (e.g. 'pdf'). Relative links
    starting with './' are resolved against `url`; everything else is
    returned as found. Assumes `url` ends with '/' for the relative case
    — TODO confirm with callers.
    """
    html = getFullHtml(url)
    suffix = '.{}'.format(extd)
    res = []
    for line in html.split('\n'):
        if suffix not in line:
            continue
        try:
            href = line.index('href')
            start = line.index('"', href) + 1
            end = line.index('"', start)
        except ValueError:
            # The line mentions the extension but has no quoted href
            # (the original crashed with ValueError here); skip it.
            continue
        filename = line[start:end]
        full_url = url + filename[1:] if filename.startswith('./') else filename
        res.append(full_url)
    return res
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment