gregorynicholas · April 18, 2012 00:58
diff --git a/imagedownloader.py b/imagedownloader.py
 # imageDownloader.py
 # Finds and downloads all images from any given URL recursively.
 # FB - 201009094
 import urllib2
 from os.path import basename
 import urlparse
 from BeautifulSoup import BeautifulSoup # for HTML parsing

 # Pages that have already been fetched. downloadImages() consults this list
 # before fetching a URL and appends to it after a successful fetch.
 urlList = list()

 # recursively download images starting from the root URL
 def downloadImages(url, level, minFileSize): # the root URL is level 0
     """Save the large-enough images referenced by the page at `url`.

     The page is fetched and parsed; every <img> whose (resolved) URL ends in
     a known image extension is downloaded, and images of at least
     `minFileSize` bytes are written to the current working directory under
     the basename of the URL path (an existing file of that name is
     overwritten). While `level` is greater than 0, every <a href> on the
     page is visited recursively with `level - 1`.

     Module globals used (both must exist before the first call):
       website -- last two host labels of the root URL, concatenated;
                  URLs on any other host are skipped.
       urlList -- URLs of pages already fetched; used to avoid refetching.

     Arguments:
       url         -- absolute URL of the page to scan.
       level       -- how many more link hops to follow from this page.
       minFileSize -- minimum image size, in bytes, worth saving.

     Returns None. Fetch and write failures are swallowed on purpose so that
     one bad URL does not abort the rest of the crawl.
     """
     # do not go to other websites; URLs whose host has fewer than two labels
     # (mailto:, javascript:, bare host names) can never match, so skip them
     # instead of failing on the [-2] index
     hostLabels = urlparse.urlsplit(url).netloc.split('.')
     if len(hostLabels) < 2 or hostLabels[-2] + hostLabels[-1] != website:
         return

     if url in urlList: # prevent using the same URL again
         return

     try:
         urlContent = urllib2.urlopen(url).read()
         urlList.append(url)
         print(url)
     except Exception:
         # unreachable page: nothing to scan
         # (Exception, not a bare except, so Ctrl-C still stops the crawl)
         return

     soup = BeautifulSoup(urlContent)

     # find and download all images
     imageSuffixes = ('.jpeg', '.jpg', '.gif', '.png', '.bmp')
     for imgTag in soup.findAll('img'):
         src = imgTag.get('src')
         if not src: # <img> without a usable src attribute
             continue
         # src is often relative; resolve it against the page URL
         imgUrl = urlparse.urljoin(url, src)
         # download only the proper image files
         if not imgUrl.lower().endswith(imageSuffixes):
             continue
         try:
             imgData = urllib2.urlopen(imgUrl).read()
             if len(imgData) >= minFileSize:
                 print("    " + imgUrl)
                 fileName = basename(urlparse.urlsplit(imgUrl)[2])
                 with open(fileName, 'wb') as output:
                     output.write(imgData)
         except Exception:
             # best effort: skip images that cannot be fetched or saved
             continue
     print("")
     print("")

     # if there are links on the webpage then recursively repeat
     if level > 0:
         for linkTag in soup.findAll('a'):
             href = linkTag.get('href')
             if not href: # anchor without a target
                 continue
             try:
                 # href may be relative; resolve it before the domain check
                 downloadImages(urlparse.urljoin(url, href), level - 1, minFileSize)
             except Exception:
                 # best effort: a page that fails to parse must not stop
                 # the remaining links from being visited
                 continue

 # main: scan the root page and every same-site page it links to (one hop),
 # saving images of at least 50000 bytes
 rootUrl = 'http://www.yahoo.com'
 netloc = urlparse.urlsplit(rootUrl).netloc.split('.')
 # downloadImages() compares the last two host labels of each URL to this
 website = ''.join(netloc[-2:])
 downloadImages(rootUrl, 1, 50000)
	# imageDownloader.py
	# Finds and downloads all images from any given URL recursively.
	# FB - 201009094
	import urllib2
	from os.path import basename
	import urlparse
	from BeautifulSoup import BeautifulSoup # for HTML parsing

	# Pages that have already been fetched. downloadImages() consults this list
	# before fetching a URL and appends to it after a successful fetch.
	urlList = list()

	# recursively download images starting from the root URL
	def downloadImages(url, level, minFileSize): # the root URL is level 0
	    """Save the large-enough images referenced by the page at `url`.

	    The page is fetched and parsed; every <img> whose (resolved) URL ends in
	    a known image extension is downloaded, and images of at least
	    `minFileSize` bytes are written to the current working directory under
	    the basename of the URL path (an existing file of that name is
	    overwritten). While `level` is greater than 0, every <a href> on the
	    page is visited recursively with `level - 1`.

	    Module globals used (both must exist before the first call):
	      website -- last two host labels of the root URL, concatenated;
	                 URLs on any other host are skipped.
	      urlList -- URLs of pages already fetched; used to avoid refetching.

	    Arguments:
	      url         -- absolute URL of the page to scan.
	      level       -- how many more link hops to follow from this page.
	      minFileSize -- minimum image size, in bytes, worth saving.

	    Returns None. Fetch and write failures are swallowed on purpose so that
	    one bad URL does not abort the rest of the crawl.
	    """
	    # do not go to other websites; URLs whose host has fewer than two labels
	    # (mailto:, javascript:, bare host names) can never match, so skip them
	    # instead of failing on the [-2] index
	    hostLabels = urlparse.urlsplit(url).netloc.split('.')
	    if len(hostLabels) < 2 or hostLabels[-2] + hostLabels[-1] != website:
	        return

	    if url in urlList: # prevent using the same URL again
	        return

	    try:
	        urlContent = urllib2.urlopen(url).read()
	        urlList.append(url)
	        print(url)
	    except Exception:
	        # unreachable page: nothing to scan
	        # (Exception, not a bare except, so Ctrl-C still stops the crawl)
	        return

	    soup = BeautifulSoup(urlContent)

	    # find and download all images
	    imageSuffixes = ('.jpeg', '.jpg', '.gif', '.png', '.bmp')
	    for imgTag in soup.findAll('img'):
	        src = imgTag.get('src')
	        if not src: # <img> without a usable src attribute
	            continue
	        # src is often relative; resolve it against the page URL
	        imgUrl = urlparse.urljoin(url, src)
	        # download only the proper image files
	        if not imgUrl.lower().endswith(imageSuffixes):
	            continue
	        try:
	            imgData = urllib2.urlopen(imgUrl).read()
	            if len(imgData) >= minFileSize:
	                print(" " + imgUrl)
	                fileName = basename(urlparse.urlsplit(imgUrl)[2])
	                with open(fileName, 'wb') as output:
	                    output.write(imgData)
	        except Exception:
	            # best effort: skip images that cannot be fetched or saved
	            continue
	    print("")
	    print("")

	    # if there are links on the webpage then recursively repeat
	    if level > 0:
	        for linkTag in soup.findAll('a'):
	            href = linkTag.get('href')
	            if not href: # anchor without a target
	                continue
	            try:
	                # href may be relative; resolve it before the domain check
	                downloadImages(urlparse.urljoin(url, href), level - 1, minFileSize)
	            except Exception:
	                # best effort: a page that fails to parse must not stop
	                # the remaining links from being visited
	                continue

	# main: scan the root page and every same-site page it links to (one hop),
	# saving images of at least 50000 bytes
	rootUrl = 'http://www.yahoo.com'
	netloc = urlparse.urlsplit(rootUrl).netloc.split('.')
	# downloadImages() compares the last two host labels of each URL to this
	website = ''.join(netloc[-2:])
	downloadImages(rootUrl, 1, 50000)
No results found