import requests import csv import os import json import re from bs4 import BeautifulSoup import mechanize from random import choice user_agents = ['Mozilla/5.0 (X11; U; Linux; i686; en-US; rv:1.6) Gecko Debian/1.6-7','Konqueror/3.0-rc4; (Konqueror/3.0-rc4; i686 Linux;;datecode)','Opera/9.52 (X11; Linux i686; U; en)'] random_user_agent = choice(user_agents) import urllib def all_great_movies(): ebert_url = "https://www.rogerebert.com/great-movies?utf8=%E2%9C%93&filters%5Btitle%5D=&sort%5Border%5D=newest&filters%5Byears%5D%5B%5D=1914&filters%5Byears%5D%5B%5D=2020&filters%5Bstar_rating%5D%5B%5D=0.0&filters%5Bstar_rating%5D%5B%5D=4.0&filters%5Bno_stars%5D=1&page={}" curr_page = 1 headers = {'accept': 'application/json'} while True: print("Parsing page {}".format(curr_page)) data = [] response = requests.get(ebert_url.format(curr_page), headers=headers) data_soup = BeautifulSoup(response.json()['html'], features="html.parser") reviews = data_soup.find_all("div", class_="review-stack") for review in reviews: title = review.find("h5", class_="review-stack--title") title_anchor = title.find("a") review_link = title_anchor['href'] review_text = title_anchor.text data.append({ "review_url": review_link, "title": review_text }) if data: print("{} movies saved".format(len(data))) with open("data/{}.json".format(curr_page), 'w') as f: f.write(json.dumps(data)) curr_page += 1 else: break def amazon_search(movie_title): print("Searching movie: {}".format(movie_title)) url = "https://www.amazon.com/s?k={}&i=instant-video" search_key = urllib.parse.quote_plus(movie_title.lower()) end = url.format(search_key) br = mechanize.Browser() br.addheaders = [('User-Agent', random_user_agent)] response = br.open(end) data_soup = BeautifulSoup(response.get_data(), features="html.parser") txt = str(data_soup).replace("\n", "") res = data_soup.find_all() import re rgex = re.compile(r"""