bobby569 · June 20, 2020 21:56
diff --git a/urllib-extend.py b/urllib-extend.py
 import bs4
 import re
 import urllib.request


 def getFullHtml(url: str) -> str:
    """Retrieve the html content of the given url.

    Args:
        url: Address to fetch; any scheme `urllib.request.urlopen`
            accepts works.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response even when read() or decode()
    # raises; previously the response object was never closed explicitly.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')


 def getTags(url: str, tag: str) -> list:
    """Retrieve every element with the given tag name from the page at `url`.

    Args:
        url: Address of the page to download and parse.
        tag: Tag name to search for, e.g. 'a'.

    Returns:
        The matches produced by calling the `BeautifulSoup` object: a
        list-like collection of parsed elements, not plain strings.
    """
    # Annotated as plain `list`: the former `List[str]` referenced a name
    # this file never imports, which raises NameError when the def executes.
    html = getFullHtml(url)
    soup = bs4.BeautifulSoup(html, 'html.parser')
    # Calling the soup object is shorthand for soup.find_all(tag).
    return soup(tag)


 def getContent(tagArr: list, attr: str) -> list:
    """Retrieve the attribute value for each tag in `tagArr`.

    Args:
        tagArr: Tag objects; each one must offer `.get(attr)`.
        attr: Name of the attribute to look up on every tag.

    Returns:
        The truthy attribute values in input order. Tags where the
        attribute is missing or empty are skipped.
    """
    # Plain `list` annotations replace `List[str]`: `List` is never imported
    # in this file, and the elements are tag objects rather than strings.
    values = (tag.get(attr) for tag in tagArr)
    return [value for value in values if value]


 def getLinks(start_url: str, depth: int = 3) -> set:
    """Get all links encountered within up to `depth` hops of `start_url`.

    The crawl proceeds level by level: each round fetches every page found
    in the previous round and keeps the links not seen before.

    Args:
        start_url: Page where the crawl begins; always part of the result.
        depth: Maximum number of rounds to follow links.

    Returns:
        A set (hence unordered) holding `start_url` plus every absolute
        http/https link discovered.
    """
    def getEmbedUrl(url: str) -> set:
        """Return the absolute http(s) links on `url`; empty set on failure."""
        try:
            tags = getTags(url, 'a')
            content = getContent(tags, 'href')
            return {val for val in content if re.match(r'https?://', val)}
        except Exception:
            # Best-effort crawl: a page that cannot be fetched or parsed
            # contributes no links. Catching `Exception` instead of using a
            # bare except lets KeyboardInterrupt and SystemExit propagate.
            return set()

    res = {start_url}
    queue = {start_url}
    for _ in range(depth):
        if not queue:
            # Nothing new was found last round, so later rounds are no-ops.
            break
        tmp = set()
        for url in queue:
            tmp |= getEmbedUrl(url) - res
        res |= tmp
        queue = tmp
    return res


 def getFile(url: str, extd: str) -> list:
    """List the links to files with extension `extd` on the page at `url`.

    The page is scanned line by line. For every line containing `.extd`,
    the first double-quoted value following `href` is taken as the link.

    Args:
        url: Page to download; also used as the prefix for links that
            start with './'.
        extd: File extension without the leading dot, e.g. 'pdf'.

    Returns:
        The links in page order. Links starting with './' are prefixed
        with `url`; all others are returned as written in the page.
    """
    html = getFullHtml(url)
    suffix = '.{}'.format(extd)

    res = []
    for line in html.split('\n'):
        if suffix not in line:
            continue
        try:
            href = line.index('href')
            start = line.index('"', href) + 1
            end = line.index('"', start)
        except ValueError:
            # The extension appears on this line but there is no
            # double-quoted href to read; skip the line instead of letting
            # the ValueError abort the whole scan.
            continue
        filename = line[start:end]
        # './name' becomes url + '/name'; anything else is kept verbatim.
        full_url = url + filename[1:] if filename.startswith('./') else filename
        res.append(full_url)
    return res
	import bs4
	import re
	import urllib.request


	def getFullHtml(url: str) -> str:
	    """Retrieve the html content of the given url.

	    Args:
	        url: Address to fetch; any scheme `urllib.request.urlopen`
	            accepts works.

	    Returns:
	        The response body decoded as UTF-8.

	    Raises:
	        urllib.error.URLError: If the request fails.
	        UnicodeDecodeError: If the body is not valid UTF-8.
	    """
	    # The context manager closes the response even when read() or decode()
	    # raises; previously the response object was never closed explicitly.
	    with urllib.request.urlopen(url) as response:
	        return response.read().decode('utf-8')


	def getTags(url: str, tag: str) -> list:
	    """Retrieve every element with the given tag name from the page at `url`.

	    Args:
	        url: Address of the page to download and parse.
	        tag: Tag name to search for, e.g. 'a'.

	    Returns:
	        The matches produced by calling the `BeautifulSoup` object: a
	        list-like collection of parsed elements, not plain strings.
	    """
	    # Annotated as plain `list`: the former `List[str]` referenced a name
	    # this file never imports, which raises NameError when the def executes.
	    html = getFullHtml(url)
	    soup = bs4.BeautifulSoup(html, 'html.parser')
	    # Calling the soup object is shorthand for soup.find_all(tag).
	    return soup(tag)


	def getContent(tagArr: list, attr: str) -> list:
	    """Retrieve the attribute value for each tag in `tagArr`.

	    Args:
	        tagArr: Tag objects; each one must offer `.get(attr)`.
	        attr: Name of the attribute to look up on every tag.

	    Returns:
	        The truthy attribute values in input order. Tags where the
	        attribute is missing or empty are skipped.
	    """
	    # Plain `list` annotations replace `List[str]`: `List` is never imported
	    # in this file, and the elements are tag objects rather than strings.
	    values = (tag.get(attr) for tag in tagArr)
	    return [value for value in values if value]


	def getLinks(start_url: str, depth: int = 3) -> set:
	    """Get all links encountered within up to `depth` hops of `start_url`.

	    The crawl proceeds level by level: each round fetches every page found
	    in the previous round and keeps the links not seen before.

	    Args:
	        start_url: Page where the crawl begins; always part of the result.
	        depth: Maximum number of rounds to follow links.

	    Returns:
	        A set (hence unordered) holding `start_url` plus every absolute
	        http/https link discovered.
	    """
	    def getEmbedUrl(url: str) -> set:
	        """Return the absolute http(s) links on `url`; empty set on failure."""
	        try:
	            tags = getTags(url, 'a')
	            content = getContent(tags, 'href')
	            return {val for val in content if re.match(r'https?://', val)}
	        except Exception:
	            # Best-effort crawl: a page that cannot be fetched or parsed
	            # contributes no links. Catching `Exception` instead of using a
	            # bare except lets KeyboardInterrupt and SystemExit propagate.
	            return set()

	    res = {start_url}
	    queue = {start_url}
	    for _ in range(depth):
	        if not queue:
	            # Nothing new was found last round, so later rounds are no-ops.
	            break
	        tmp = set()
	        for url in queue:
	            tmp |= getEmbedUrl(url) - res
	        res |= tmp
	        queue = tmp
	    return res


	def getFile(url: str, extd: str) -> list:
	    """List the links to files with extension `extd` on the page at `url`.

	    The page is scanned line by line. For every line containing `.extd`,
	    the first double-quoted value following `href` is taken as the link.

	    Args:
	        url: Page to download; also used as the prefix for links that
	            start with './'.
	        extd: File extension without the leading dot, e.g. 'pdf'.

	    Returns:
	        The links in page order. Links starting with './' are prefixed
	        with `url`; all others are returned as written in the page.
	    """
	    html = getFullHtml(url)
	    suffix = '.{}'.format(extd)

	    res = []
	    for line in html.split('\n'):
	        if suffix not in line:
	            continue
	        try:
	            href = line.index('href')
	            start = line.index('"', href) + 1
	            end = line.index('"', start)
	        except ValueError:
	            # The extension appears on this line but there is no
	            # double-quoted href to read; skip the line instead of letting
	            # the ValueError abort the whole scan.
	            continue
	        filename = line[start:end]
	        # './name' becomes url + '/name'; anything else is kept verbatim.
	        full_url = url + filename[1:] if filename.startswith('./') else filename
	        res.append(full_url)
	    return res
No results found