Created November 26, 2017 19:06
Python BeautifulSoup Scraper that scrapes book covers, titles and authors from www.goodreads.com
| """ | |
| Author @ Kyaw Khant Nyar | |
| github: kyawkn | |
| """ | |
| import requests | |
| import csv | |
| from bs4 import BeautifulSoup as bs | |
| import urllib | |
| import os | |
| def scrape_and_run(genre): | |
| # scrape on goodreads.com using desire genre type or key word | |
| # and save the titles and autors in a csv file | |
| page = requests.get("https://www.goodreads.com/shelf/show/" + genre) | |
| soup = bs(page.content, 'html.parser') | |
| titles = soup.find_all('a', class_='bookTitle') | |
| authors = soup.find_all('a', class_='authorName') | |
| image_dir = os.getcwd() + "/images/" + genre | |
| ## check if the desire genre path exists | |
| ## create a new one if it doesnt | |
| if not os.path.exists(image_dir): | |
| os.makedirs(image_dir) | |
| with open(genre + '.csv', 'w') as csvfile: | |
| fieldnames = ['title', 'author'] | |
| csv_write = csv.DictWriter(csvfile, fieldnames=fieldnames) | |
| books_save = 0 | |
| for title, author in zip(titles, authors): | |
| try: | |
| ## single book page | |
| book_page = requests.get("https://www.goodreads.com" + title['href']) | |
| soup = bs(book_page.content, 'html.parser') | |
| # get image id | |
| image = soup.find('img', id='coverImage') | |
| title_name = title.get_text() | |
| save_dir = image_dir + "/" + title_name | |
| urllib.request.urlretrieve(image['src'], save_dir) | |
| csv_write.writerow({'title': title_name, 'author': author.get_text()}) | |
| books_save += 1 | |
| ## error handelling for long file names | |
| except OSError as exc: | |
| if exc.errno == 36: | |
| print(exc) | |
| print("%d %s books saved." % (books_save, genre)) # books count feedback | |
| if __name__ == '__main__': | |
| ## run ifinite till user tells you to stop | |
| ## to avoid having to compile again and again | |
| while True: | |
| genre = input("Enter the genre (or quit to stop): ").lower() # input case lowered | |
| if(genre == "quit"): | |
| break | |
| else: | |
| scrape_and_run(genre) |
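For reference, a sample run might look like the following. The script name and the book count here are hypothetical; the count depends on how many books the shelf page returns.

    $ python goodreads_scraper.py
    Enter the genre (or quit to stop): fantasy
    50 fantasy books saved.
    Enter the genre (or quit to stop): quit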
I also tried to modify your code above to scrape multiple pages on Goodreads, but for some reason I am only able to get results from page 1. The URL that I am passing is correct; the same URL pasted into the browser returns the expected page results. Any help would be much appreciated.
@Ak1904rh Did you ever figure out the page 1 problem? I'm currently struggling with the same issue!
@Ak1904rh @melaniewalsh Did you figure it out? I really want this right now.
@Anitha-Selvan @Ak1904rh I ended up collaborating on a different Goodreads scraper with @maria-antoniak! https://github.com/maria-antoniak/goodreads-scraper We don't scrape book covers, but maybe that's something we could do in the future...
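For anyone hitting the page 1 issue in this thread, a minimal sketch of a multi-page loop is below. The URL pattern and page range are assumptions based on how shelf URLs appear in the browser, and, as the replies suggest, Goodreads may only serve pages past 1 to signed-in sessions, so an anonymous request can silently fall back to page 1. The duplicate check is one way to detect that fallback.

    import requests
    from bs4 import BeautifulSoup as bs

    def scrape_shelf_pages(genre, max_pages=5):
        # Hypothetical multi-page loop; Goodreads may still return page 1
        # for anonymous clients regardless of the ?page= value.
        all_titles = []
        for page_num in range(1, max_pages + 1):
            url = "https://www.goodreads.com/shelf/show/%s?page=%d" % (genre, page_num)
            page = requests.get(url)
            soup = bs(page.content, 'html.parser')
            titles = [a.get_text() for a in soup.find_all('a', class_='bookTitle')]
            if titles and titles == all_titles[-len(titles):]:
                break  # same books as the previous page: pagination isn't working
            all_titles.extend(titles)
        return all_titles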
Hi,
This is a great example and it really taught me a lot about web scraping! Many thanks for your efforts. I do have a further question for you. I am trying to extend this code of yours to return data from multiple pages. I can build the right URL (with the updated page= parameter appended), but I am only getting back data from page 1, even though I have signed into Goodreads on my laptop.
Any guidance you can provide would be much appreciated.
Thanks in advance
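One possible explanation for the behavior described above: being signed in to Goodreads in the browser doesn't help, because requests doesn't share the browser's cookies. A sketch of reusing a browser session cookie is below; the cookie name _session_id2 is a guess, so copy whatever session cookie your signed-in browser actually sends (DevTools -> Application/Storage -> Cookies).

    import requests
    from bs4 import BeautifulSoup as bs

    session = requests.Session()
    # Cookie name and value are placeholders: paste the real session cookie
    # from your signed-in browser here.
    session.cookies.set("_session_id2", "PASTE_COOKIE_VALUE_HERE",
                        domain=".goodreads.com")

    page = session.get("https://www.goodreads.com/shelf/show/fantasy?page=2")
    soup = bs(page.content, 'html.parser')
    # If the cookie works, these titles should differ from page 1's titles.
    print([a.get_text() for a in soup.find_all('a', class_='bookTitle')][:5])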