Skip to content

Instantly share code, notes, and snippets.

@islamgab
Forked from ndunn219/sitemap_checker.py
Created February 13, 2019 21:57
Show Gist options
  • Select an option

  • Save islamgab/ec65ff30838863afa7902634f2ebccbd to your computer and use it in GitHub Desktop.

Select an option

Save islamgab/ec65ff30838863afa7902634f2ebccbd to your computer and use it in GitHub Desktop.

Revisions

  1. @ndunn219 ndunn219 revised this gist May 2, 2016. No changes.
  2. @ndunn219 ndunn219 revised this gist May 2, 2016. No changes.
  3. @ndunn219 ndunn219 revised this gist May 2, 2016. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion sitemap_checker.py
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,4 @@
    import requests, re
    import requests
    from bs4 import BeautifulSoup

    sitemap = 'http://www.nasa.gov/sitemap/sitemap_nasa.html'
  4. @ndunn219 ndunn219 revised this gist May 2, 2016. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions sitemap_checker.py
    Original file line number Diff line number Diff line change
    @@ -40,8 +40,8 @@
    i += 1
    print(i, end='. ')
    for response in result[1]:
    print(response.url, end='\n\t>>> ')
    print('>>>',result[3])
    print('>>', response.url, end='\n\t')
    print('>>>>',result[3])

    #non-200s
    print('\n==========\nERRORS')
  5. @ndunn219 ndunn219 created this gist May 2, 2016.
    50 changes: 50 additions & 0 deletions sitemap_checker.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,50 @@
    import requests, re
    from bs4 import BeautifulSoup

    # Sitemap checker: fetch an HTML sitemap page, request every absolute link
    # found on it, then print (1) the redirect chains that were followed and
    # (2) every URL whose final status was not 200.

    sitemap = 'http://www.nasa.gov/sitemap/sitemap_nasa.html'

    # Seconds to wait on each request. requests applies no timeout by default,
    # so a single unresponsive server would otherwise stall the whole run.
    timeout = 10

    r = requests.get(sitemap, timeout=timeout)
    html = r.content

    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a')
    # Keep only anchors that carry an href starting with 'http' (absolute
    # http/https links); relative links, fragments and mailto: are ignored.
    urls = [link.get('href') for link in links
            if link.get('href') and link.get('href').startswith('http')]

    # Each entry is a tuple: (status_code, redirect_history, requested_url, message).
    # A status_code of 0 marks a request that raised instead of returning.
    results = []
    for url in urls:
        try:
            resp = requests.get(url, timeout=timeout)
            if resp.history:
                # A redirect chain was followed. Only call it "No error" when the
                # final response is actually a 200; a chain ending in e.g. a 404
                # must not be reported as healthy.
                verdict = 'No error.' if resp.status_code == 200 else 'Error?'
                result = (resp.status_code, resp.history, url,
                          verdict + ' Redirect to ' + resp.url)
            elif resp.status_code == 200:
                result = (resp.status_code, resp.history, url, 'No error. No redirect.')
            else:
                result = (resp.status_code, resp.history, url, 'Error?')
        except Exception as e:
            # Deliberately broad: one bad link (DNS failure, timeout, malformed
            # URL, ...) is recorded with status 0 rather than aborting the run.
            result = (0, [], url, e)

        results.append(result)

    #Sort by status and then by history length
    results.sort(key=lambda result: (result[0], len(result[1])))

    #301s - may want to clean up 301s if you have multiple redirects
    # (the section lists every URL with a non-empty redirect history, whatever
    # the individual redirect status codes were)
    print('301s')
    i = 0
    for result in results:
        if result[1]:
            i += 1
            print(i, end='. ')
            # One line per hop in the chain, then the final verdict on its own
            # line with a distinct marker.
            for response in result[1]:
                print('>>', response.url, end='\n\t')
            print('>>>>', result[3])

    #non-200s
    print('\n==========\nERRORS')
    for result in results:
        if result[0] != 200:
            print(result[0], '-', result[2])