#!/usr/bin/env python3
"""Print every page URL reachable from the AWS documentation sitemap index.

The crawl starts at ``SITEMAP_URI``, follows each nested sitemap referenced
by a ``<sitemap><loc>`` entry, and prints the text of every ``<url><loc>``
entry to stdout. Diagnostics go to stderr so stdout stays a clean URL list.
"""
import argparse
import sys
import xml.etree.ElementTree as ET

import requests

SITEMAP_URI = 'https://docs.aws.amazon.com/sitemap_index.xml'

# Seconds to wait for a server response before abandoning one sitemap;
# without a timeout a single stalled connection would hang the whole crawl.
REQUEST_TIMEOUT = 30


def main():
    """Crawl the sitemap tree rooted at ``SITEMAP_URI``."""
    get_sitemap_and_parse(SITEMAP_URI)


def get_sitemap_and_parse(sitemap_uri, _seen=None):
    """Fetch one sitemap document and recursively process its contents.

    The crawl is best-effort: a sitemap that cannot be fetched or parsed is
    reported on stderr and skipped, and the rest of the crawl continues.

    Args:
        sitemap_uri: URL of a sitemap or sitemap-index document.
        _seen: Internal set of sitemap URLs already visited during this
            crawl. Callers normally omit it; it is threaded through the
            recursion so a sitemap that references itself (or an ancestor)
            is fetched only once instead of recursing without bound.
    """
    if _seen is None:
        _seen = set()
    if sitemap_uri in _seen:
        return
    _seen.add(sitemap_uri)

    try:
        res = requests.get(
            sitemap_uri, allow_redirects=False, timeout=REQUEST_TIMEOUT
        )
        res.raise_for_status()
    except requests.RequestException as e:
        print(f'warning: could not fetch {sitemap_uri}: {e}', file=sys.stderr)
        return

    # raise_for_status() only rejects 4xx/5xx. Redirects are not followed
    # here, so a 3xx body is not a sitemap and must not be parsed as one.
    if res.status_code != 200:
        print(
            f'warning: skipping {sitemap_uri}: HTTP {res.status_code}',
            file=sys.stderr,
        )
        return

    try:
        # Pass the raw bytes so the XML parser honours the encoding declared
        # in the document rather than the charset requests guessed from the
        # HTTP headers.
        parse_sitemap_xml(res.content, _seen)
    except ET.ParseError as e:
        print(f'warning: could not parse {sitemap_uri}: {e}', file=sys.stderr)


def parse_sitemap_xml(sitemap_data, _seen=None):
    """Parse one sitemap document, recursing into children and printing URLs.

    Every ``<sitemap><loc>`` entry is fetched and processed via
    ``get_sitemap_and_parse``; every ``<url><loc>`` entry is printed to
    stdout, one per line, with surrounding whitespace removed. Empty
    ``<loc>`` elements are ignored.

    Args:
        sitemap_data: The XML document as ``str`` or ``bytes``.
        _seen: Internal set of already-visited sitemap URLs, forwarded to
            ``get_sitemap_and_parse``. Callers normally omit it.

    Raises:
        xml.etree.ElementTree.ParseError: If ``sitemap_data`` is not
            well-formed XML.
    """
    if _seen is None:
        _seen = set()

    root = ET.fromstring(sitemap_data)

    # ElementTree spells a namespaced tag as '{namespace}tag'. Keep the
    # '{namespace}' part as a prefix for child lookups; a document without a
    # namespace has a bare tag and needs no prefix at all.
    tag = root.tag
    prefix = tag[:tag.index('}') + 1] if tag.startswith('{') else ''

    for loc in root.findall(f'{prefix}sitemap/{prefix}loc'):
        child_uri = (loc.text or '').strip()
        if child_uri:
            get_sitemap_and_parse(child_uri, _seen)

    for loc in root.findall(f'{prefix}url/{prefix}loc'):
        page_url = (loc.text or '').strip()
        if page_url:
            print(page_url)


if __name__ == "__main__":
    main()