Created
August 6, 2019 10:19
-
-
Save dgaitsgo/cbe3296b2cc57f709eeccba43dbbd570 to your computer and use it in GitHub Desktop.
Fetching and parsing CDC data for cumulative year-to-date total measles cases by state, territory, and region
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # coding: utf-8 | |
| # In[30]: | |
| import os | |
| import json | |
| from contextlib import closing | |
| from bs4 import BeautifulSoup | |
| import csv | |
| # In[7]: | |
| from requests import get | |
| from requests.exceptions import RequestException | |
| from contextlib import closing | |
def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the server before giving up. Forwarded
            to `requests.get`; without it a stalled server would block the
            call indefinitely.

    Returns:
        The raw response body (`resp.content`, i.e. bytes) when
        `is_good_response` accepts the response, otherwise None. None is
        also returned when the request raises a `RequestException`; the
        error is reported through `log_error`.
    """
    try:
        # closing() guarantees the streamed response is released even if
        # reading the body fails part-way through.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None
    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its Content-Type
    header contains 'html' (compared case-insensitively). A response with
    no Content-Type header at all is treated as not-HTML instead of
    raising KeyError.
    """
    # Look the header up with .get() so a missing header yields None; the
    # None check must happen before .lower() is called.
    content_type = resp.headers.get('Content-Type')
    if content_type is None:
        return False
    return resp.status_code == 200 and 'html' in content_type.lower()
def log_error(e):
    """
    Reports an error by printing its string form to standard output.
    """
    print(str(e))
| # In[8]: | |
# get latest data
# The reporting week and year are hard-coded strings; both appear twice in
# the table URL (once as path segments, once in the file name).
currWeek = "30"
currYear = "2019"
latestLink = (
    'https://wonder.cdc.gov/nndss/static/'
    f'{currYear}/{currWeek}/{currYear}-{currWeek}-table1v.html'
)
# raw is the page body, or None when simple_get could not fetch HTML.
raw = simple_get(latestLink)
| # In[10]: | |
# make a tree
# simple_get returns None whenever the request fails or the response is not
# HTML; stop here with a clear message rather than handing None to the parser.
if raw is None:
    raise RuntimeError(f'Could not fetch an HTML page from {latestLink}')
tree = BeautifulSoup(raw, 'html.parser')
| # In[52]: | |
# parse and export
# Writes one CSV line per table row: the row's <th> text followed by the two
# cumulative year-to-date counts.
csvHeader = ["reporting_area", "cum_ytd_indigenous", "cum_ytd_imported"]
rows = tree.find_all('tr')

# A cell whose text is exactly '-' is exported as '0'; anything else is
# written unchanged.
dash_as_zero = {'-': '0'}

with open('state-cum-measles.csv', 'w', newline='') as csvfile:
    data = csv.writer(csvfile)
    data.writerow(csvHeader)
    # The first three <tr> elements are skipped (presumably table header
    # rows -- confirm against the page markup).
    for curr_row in rows[3:]:
        reporting_area = curr_row.find('th').text
        # Each selector matches <td> cells whose `headers` attribute contains
        # the given id; the first match in the row is used.
        indigenous_text = curr_row.select('td[headers*=SH2-3]')[0].text
        imported_text = curr_row.select('td[headers*=SH3-3]')[0].text
        cum_ytd_indigenous = dash_as_zero.get(indigenous_text, indigenous_text)
        cum_ytd_imported = dash_as_zero.get(imported_text, imported_text)
        csvRow = [reporting_area, cum_ytd_indigenous, cum_ytd_imported]
        data.writerow(csvRow)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment