Skip to content

Instantly share code, notes, and snippets.

@scruwys
Created May 15, 2015 20:27
Show Gist options
  • Select an option

  • Save scruwys/f8188ca1521777704f9d to your computer and use it in GitHub Desktop.

Select an option

Save scruwys/f8188ca1521777704f9d to your computer and use it in GitHub Desktop.
all_time_box_office.py
from bs4 import BeautifulSoup
import requests
import csv
import re
def write_to_tsv(filename, results):
    """Append rows to a tab-separated file.

    Args:
        filename: Path of the file to append to; it is created if missing.
        results: Sequence of dicts that share the same keys. The keys of the
            first dict determine the column order. No header row is written.

    An empty ``results`` is a no-op and leaves the file untouched.
    """
    if not results:
        # Nothing to write; also avoids indexing results[0] on an empty list.
        return

    # The csv writer emits text, so the file must be opened in text mode.
    # newline='' lets the csv module control line endings itself, and an
    # explicit encoding keeps non-ASCII values portable across platforms.
    with open(filename, 'a', newline='', encoding='utf-8') as outfile:
        writer = csv.DictWriter(
            outfile,
            fieldnames=list(results[0].keys()),
            delimiter="\t",
        )
        writer.writerows(results)
if __name__ == '__main__':
    # Download the all-time gross chart, turn each table row into a dict and
    # append the rows to results.tsv.
    resp = requests.get('http://www.imdb.com/boxoffice/alltimegross', timeout=30)
    # Stop on an HTTP error rather than trying to parse an error page.
    resp.raise_for_status()

    # Name the parser explicitly so the parse does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(resp.text, 'html.parser')
    main_div = soup.find('div', id='main')
    table = main_div.find('table') if main_div is not None else None
    if table is None:
        raise SystemExit('could not find the box office table in the page')

    # Splits "Title (Year)" into (title, year). Compiled once, outside the loop.
    pattern = re.compile(r'([^(]+)\s*\(([^)]+)\)\s*(?:,\s*|$)')

    movies = []
    # [1:] skips the first table row.
    for num, row in enumerate(table.find_all('tr')[1:]):
        cells = row.find_all('td')
        if len(cells) < 3:
            # Not a data row (too few cells); skip instead of raising IndexError.
            continue

        # Apostrophes are dropped from the cell text before matching.
        title = pattern.findall(cells[1].text.replace("'", ""))
        if not title:
            # Cell text is not in "Title (Year)" form; skip the row.
            continue
        name, year = title[0]

        # A plain dict stands in for a Movie "object".
        movie = {
            # Rank is the row's position in the table, so skipped rows do not
            # shift the numbering of the rows that follow.
            "Rank": num + 1,
            # Keep the title as text; encoding is the file writer's job.
            "Title": name.strip(),
            "Release Year": year.strip().replace("/I", ""),
            "Box Office": float(cells[2].text.replace("$", "").replace(",", "")),
        }
        movies.append(movie)

    # Write once, after all rows are collected; skip the call when nothing
    # was parsed so an empty list is never handed to the writer.
    if movies:
        write_to_tsv('results.tsv', movies)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment