Created
July 8, 2014 22:48
-
-
Save gwgundersen/e6cd307ba2320c418a3f to your computer and use it in GitHub Desktop.
Revisions
-
gwgundersen created this gist
Jul 8, 2014 .There are no files selected for viewing
"""Minimal hand-rolled URL parsing helpers.

Reference: http://tools.ietf.org/html/rfc3986#section-3.3

Example::

    >> c = urlparse2("http://gregorygundersen.com")
    >> c
    ParseResult(scheme='http', netloc='gregorygundersen.com', path='', params='', query='', fragment='')
"""

from collections import namedtuple
import re
import pdb  # retained from the original module; no longer invoked by urlparse()


# Six-field result returned by urlparse2().
ParseResult = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

# Substrings url_parse_strings() turns into separators. Order matters:
# '//' must be handled before '/', and 'www.' before '.'.
_STRING_DELIMITERS = (':', '//', '/', 'www.', '.')

# Delimiters used by url_parse_re(); compiled once at import time.
_RE_DELIMITERS = re.compile(r':|\.|//|/|\?')


def urlparse(url):
    """Split *url* into a ``(scheme, netloc, path, query)`` tuple.

    The scheme and the host part of the netloc are lowercased; userinfo
    (anything before the last ``@`` in the netloc), the path and the query
    keep their original case.

    The returned path does not include the ``/`` that separates it from
    the netloc, and the returned query does not include the leading ``?``.
    A ``#fragment`` is not split off: it stays attached to whichever
    component it trails. Use ``urlparse2`` to have it separated.

    Missing components are returned as empty strings.
    """
    scheme, separator, remainder = url.partition('://')
    if not separator:
        # No "://" present: the whole string is netloc[/path][?query].
        scheme, remainder = '', url

    # Peel the query off first so that '/' or '?' characters inside it
    # cannot be mistaken for path or query delimiters.
    remainder, _, query = remainder.partition('?')

    # Everything up to the first '/' is the netloc; the rest is the path,
    # kept whole however many segments it has.
    netloc, _, path = remainder.partition('/')

    # Only the host is case-insensitive; leave userinfo untouched.
    userinfo, at_sign, host = netloc.rpartition('@')
    netloc = userinfo + at_sign + host.lower()

    return (scheme.lower(), netloc, path, query)


def urlparse2(url):
    """Parse *url* into a six-field ``ParseResult`` named tuple.

    The text after the first ``#`` becomes ``fragment``; the rest is split
    by ``urlparse``. ``params`` is always the empty string because this
    module does not parse path parameters.
    """
    remainder, _, fragment = url.partition('#')
    scheme, netloc, path, query = urlparse(remainder)
    return ParseResult(scheme, netloc, path, '', query, fragment)


def url_parse_strings(url):
    """Tokenize *url* using plain string replacement.

    Each of ``':'``, ``'//'``, ``'/'``, ``'www.'`` and ``'.'`` is replaced,
    in that order, by ``'|'`` and the result is split on ``'|'``. Adjacent
    delimiters therefore produce empty strings in the returned list.
    """
    for delimiter in _STRING_DELIMITERS:
        url = url.replace(delimiter, '|')
    return url.split('|')


def url_parse_re(url):
    """Tokenize *url* by splitting on ``:``, ``.``, ``//``, ``/`` and ``?``.

    Adjacent delimiters (for example the ``:`` and ``//`` in ``http://``)
    produce empty strings in the returned list.
    """
    return _RE_DELIMITERS.split(url)