Skip to content

Instantly share code, notes, and snippets.

@breyten
Created August 9, 2016 09:37
Show Gist options
  • Select an option

  • Save breyten/4dfbd1d95fa23faeb88b71dd4cbc1563 to your computer and use it in GitHub Desktop.

Select an option

Save breyten/4dfbd1d95fa23faeb88b71dd4cbc1563 to your computer and use it in GitHub Desktop.

Revisions

  1. breyten created this gist Aug 9, 2016.
    74 changes: 74 additions & 0 deletions get_climates.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,74 @@
    #!/usr/bin/env python
    import os
    import sys
    import re
    from pprint import pprint
    import json
    from time import sleep

    import requests
    from BeautifulSoup import BeautifulSoup

    def get_countries():
        """Fetch the sovereign states listed on the English Wikipedia.

        Downloads the "List of sovereign states" article and walks the rows of
        the first ``wikitable`` on the page. For each row, the first anchor
        whose ``href`` starts with ``/wiki/`` is taken as the country link;
        rows without such an anchor are skipped.

        Returns:
            dict mapping the relative article link (the anchor's ``href``)
            to the anchor text.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
        """
        resp = requests.get('https://en.wikipedia.org/wiki/List_of_sovereign_states')
        # Fail with a clear HTTP error instead of trying to scrape an error
        # page as if it were the article.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content)

        # Compile once; the same pattern is applied to every table row.
        wiki_href = re.compile(r'^\/wiki\/.*')

        countries = {}
        for country_row in soup.find('table', 'wikitable').findAll('tr'):
            link = country_row.find('a', href=wiki_href)
            if link is not None:
                countries[link['href']] = link.text
        return countries

    def get_climate_table(soup):
        """Locate the climate table in a parsed page.

        Scans every ``wikitable`` in ``soup`` in document order and returns
        the first one whose first row contains a ``th`` whose text starts
        with ``'Climate'``.

        Args:
            soup: parsed page supporting ``findAll``.

        Returns:
            The matching table element, or None when no table qualifies.
        """
        for candidate in soup.findAll('table', 'wikitable'):
            first_row = candidate.find('tr')
            # Only look for a header cell when the table has a row at all.
            heading = first_row.find('th') if first_row is not None else None
            if heading is not None and heading.text.startswith('Climate'):
                return candidate
        return None

    def get_climate_info(table):
        """Convert a climate table into nested dictionaries.

        The second row of ``table`` supplies the column headings: its first
        ``th`` is skipped and the text of each remaining ``th`` becomes a
        heading. Every later row that contains a ``th`` contributes one entry,
        named by the text nodes of that ``th`` joined with single spaces, whose
        ``td`` texts are paired with the headings in order. ``zip`` stops at
        the shorter of the two lists, so surplus headings or cells are dropped.

        Args:
            table: table element supporting ``findAll``.

        Returns:
            dict mapping category name to a dict of heading -> cell text.
            Empty when the table has fewer than two rows.
        """
        # Fetch the rows once; they are needed for both headings and data.
        rows = table.findAll('tr')
        if len(rows) < 2:
            # No heading row to read: report "no data" rather than raising
            # IndexError on rows[1].
            return {}
        months = [t.text for t in rows[1].findAll('th')[1:]]

        climate = {}
        for row in rows[2:]:
            th = row.find('th')
            if th is None:
                # Rows without a header cell carry no named category.
                continue
            cat_name = u' '.join(th.findAll(text=True))
            data = [td.text for td in row.findAll('td')]
            climate[cat_name] = dict(zip(months, data))
        return climate

    def get_country_climate(country_relative_link):
        """Fetch a country's article and extract its climate data.

        Args:
            country_relative_link: article path appended directly to the
                Wikipedia host name, as produced by get_countries().

        Returns:
            The dict built by get_climate_info() for the page's climate
            table, or an empty dict when the page has no such table.
        """
        # Use HTTPS like get_countries() does, so the page is not requested
        # over an unencrypted connection.
        resp = requests.get('https://en.wikipedia.org%s' % (country_relative_link,))
        soup = BeautifulSoup(resp.content)
        climate_table = get_climate_table(soup)
        if climate_table is None:
            return {}
        return get_climate_info(climate_table)

    def main(argv=None):
    if argv is None:
    argv = sys.argv
    countries = get_countries()
    climates = {}
    for country_link, country_name in countries.iteritems():
    climate = get_country_climate(country_link)
    climates[country_name] = {
    'url': country_link,
    'climate': climate
    }
    print >>sys.stderr, country_name
    sleep(1)
    print json.dumps(climates)
    return 0

    # When executed as a script, run main() and use its return value as the
    # process exit status.
    if __name__ == '__main__':
        exit_status = main()
        sys.exit(exit_status)