Created
April 18, 2012 00:58
-
-
Save gregorynicholas/2410227 to your computer and use it in GitHub Desktop.
Finds and downloads all images from any given URL. Originally from: http://code.activestate.com/recipes/577385-image-downloader/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # imageDownloader.py | |
| # Finds and downloads all images from any given URL recursively. | |
| # FB - 201009094 | |
| import urllib2 | |
| from os.path import basename | |
| import urlparse | |
| from BeautifulSoup import BeautifulSoup # for HTML parsing | |
# URLs already visited, shared across recursive downloadImages calls
urlList = []
| # recursively download images starting from the root URL | |
| def downloadImages(url, level, minFileSize): # the root URL is level 0 | |
| # do not go to other websites | |
| global website | |
| netloc = urlparse.urlsplit(url).netloc.split('.') | |
| if netloc[-2] + netloc[-1] != website: | |
| return | |
| global urlList | |
| if url in urlList: # prevent using the same URL again | |
| return | |
| try: | |
| urlContent = urllib2.urlopen(url).read() | |
| urlList.append(url) | |
| print url | |
| except: | |
| return | |
| soup = BeautifulSoup(''.join(urlContent)) | |
| # find and download all images | |
| imgTags = soup.findAll('img') | |
| for imgTag in imgTags: | |
| imgUrl = imgTag['src'] | |
| # download only the proper image files | |
| if imgUrl.lower().endswith('.jpeg') or \ | |
| imgUrl.lower().endswith('.jpg') or \ | |
| imgUrl.lower().endswith('.gif') or \ | |
| imgUrl.lower().endswith('.png') or \ | |
| imgUrl.lower().endswith('.bmp'): | |
| try: | |
| imgData = urllib2.urlopen(imgUrl).read() | |
| if len(imgData) >= minFileSize: | |
| print " " + imgUrl | |
| fileName = basename(urlsplit(imgUrl)[2]) | |
| output = open(fileName,'wb') | |
| output.write(imgData) | |
| output.close() | |
| except: | |
| pass | |
| # if there are links on the webpage then recursively repeat | |
| if level > 0: | |
| linkTags = soup.findAll('a') | |
| if len(linkTags) > 0: | |
| for linkTag in linkTags: | |
| try: | |
| linkUrl = linkTag['href'] | |
| downloadImages(linkUrl, level - 1, minFileSize) | |
| except: | |
| pass | |
# --- entry point ---------------------------------------------------------
rootUrl = 'http://www.yahoo.com'
# derive the site key (second-level + top-level domain labels) that
# downloadImages uses to stay on the starting website
global website
netloc = urlparse.urlsplit(rootUrl).netloc.split('.')
website = netloc[-2] + netloc[-1]
# crawl one level deep, keeping only images of at least 50 kB
downloadImages(rootUrl, 1, 50000)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment