Skip to content

Instantly share code, notes, and snippets.

@gwgundersen
Created July 8, 2014 22:48
Show Gist options
  • Select an option

  • Save gwgundersen/e6cd307ba2320c418a3f to your computer and use it in GitHub Desktop.

Select an option

Save gwgundersen/e6cd307ba2320c418a3f to your computer and use it in GitHub Desktop.

Revisions

  1. gwgundersen created this gist Jul 8, 2014.
    67 changes: 67 additions & 0 deletions fetch.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,67 @@
    from collections import namedtuple
    import re
    import pdb

    # http://tools.ietf.org/html/rfc3986#section-3.3
    """
    Reference output from the standard library's urlparse:

    >>> c = request.urlparse("http://gregorygundersen.com")
    >>> c
    ParseResult(scheme='http', netloc='gregorygundersen.com', path='', params='', query='', fragment='')
    """



    def urlparse(url):
        """Split a URL into its scheme, network location, path and query.

        The scheme and network location are lowercased because they are
        case-insensitive (RFC 3986, sections 3.1 and 3.2.2); the path and
        query are returned exactly as given because they are case-sensitive.

        Args:
            url: The URL string to split, e.g. "http://example.com/a/b?x=1".
                The "scheme://" prefix is optional.

        Returns:
            A 4-tuple ``(scheme, netloc, path, query)`` of strings. Any part
            that is missing from ``url`` is the empty string. ``path`` does
            not include the slash that separates it from the netloc, and
            ``query`` does not include the leading "?".
        """
        # Scheme: everything before the first "://", if that marker exists.
        scheme, marker, remainder = url.partition('://')
        if not marker:
            # No protocol given; the whole string is host/path/query.
            scheme, remainder = '', url

        # Peel the query off first so that a "?" directly after the host
        # ("example.com?x=1") is not mistaken for part of the netloc, and so
        # that a "/" or a second "?" inside the query is left untouched.
        remainder, _, query = remainder.partition('?')

        # Netloc is everything up to the first "/"; the rest is the full
        # path (all segments, not only the first one).
        netloc, _, path = remainder.partition('/')

        return (scheme.lower(), netloc.lower(), path, query)


    def urlparse2(url):
        """Tokenize a lowercased URL and wrap the first six tokens.

        The URL is lowercased, split by ``url_parse_strings``, and the tokens
        at positions 0-5 are placed, in order, into a ``ParseResult`` named
        tuple with fields scheme, netloc, path, params, query and fragment.
        Tokens beyond the sixth are ignored; if fewer than six tokens come
        back, the indexing error propagates to the caller.
        """
        field_names = 'scheme netloc path params query fragment'
        tokens = url_parse_strings(url.lower())
        ParseResult = namedtuple('ParseResult', field_names)
        # Index explicitly (rather than slicing) so that a short token list
        # still fails on the missing position.
        first_six = [tokens[position] for position in range(6)]
        return ParseResult(*first_six)


    def url_parse_strings(url):
        """Break a URL into tokens using plain string replacement.

        Each of ':', '//', '/', 'www.' and '.' is replaced, in that order,
        by '|', and the result is split on '|'. Adjacent delimiters therefore
        produce empty-string tokens in the returned list.
        """
        flattened = url
        # Order matters: '//' must be handled before '/', and 'www.' before '.'.
        for delimiter in (':', '//', '/', 'www.', '.'):
            flattened = flattened.replace(delimiter, '|')
        return flattened.split('|')


    def url_parse_re(url):
        """Break a URL into tokens with a single regular-expression split.

        The string is split on every ':', '.', '//', '/' or '?'. Adjacent
        delimiters (such as the ':' and '//' in "http://") produce
        empty-string tokens in the returned list.

        Args:
            url: The URL string to tokenize.

        Returns:
            A list of the substrings found between delimiters.
        """
        # Raw string: '\.' and '\?' are regex escapes, not string escapes, and
        # are invalid escape sequences in a normal string literal.
        # '//' is listed before '/' so a double slash is consumed as one
        # delimiter.
        return re.split(r':|\.|//|/|\?', url)