-
-
Save mnot/246089 to your computer and use it in GitHub Desktop.
| #!/usr/bin/env python | |
| """ | |
| urlnorm.py - URL normalisation routines | |
| urlnorm normalises a URL by; | |
| * lowercasing the scheme and hostname | |
| * taking out default port if present (e.g., http://www.foo.com:80/) | |
| * collapsing the path (./, ../, etc) | |
| * removing the last character in the hostname if it is '.' | |
| * unquoting any %-escaped characters | |
| Available functions: | |
| norms - given a URL (string), returns a normalised URL | |
| norm - given a URL tuple, returns a normalised tuple | |
| test - test suite | |
| CHANGES: | |
| 0.92 - unknown schemes now pass the port through silently | |
| 0.91 - general cleanup | |
| - changed dictionaries to lists where appropriate | |
| - more fine-grained authority parsing and normalisation | |
| """ | |
| __license__ = """ | |
| Copyright (c) 1999-2002 Mark Nottingham <[email protected]> | |
| Permission is hereby granted, free of charge, to any person obtaining a copy | |
| of this software and associated documentation files (the "Software"), to deal | |
| in the Software without restriction, including without limitation the rights | |
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| copies of the Software, and to permit persons to whom the Software is | |
| furnished to do so, subject to the following conditions: | |
| The above copyright notice and this permission notice shall be included in all | |
| copies or substantial portions of the Software. | |
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
| SOFTWARE. | |
| """ | |
| __version__ = "0.93" | |
| from urlparse import urlparse, urlunparse | |
| from urllib import unquote | |
| from string import lower | |
| import re | |
# Path fragments that are replaced by a single '/': "<segment>/../" (or a
# trailing "<segment>/.."), "/./", "//", and a trailing "/." or "/..".
# Raw strings keep "\." a regex escape instead of an invalid string escape.
_collapse = re.compile(r'([^/]+/\.\./?|/\./|//|/\.$|/\.\.$)')

# Splits an authority into (userinfo, host, port).  The host may be a
# bracketed IP literal such as "[::1]"; the port may be empty, so that
# "example.com:" parses (port == '') instead of failing to match.
_server_authority = re.compile(
    r'^(?:([^@]+)@)?(\[[^\]]*\]|[^:]+)(?::(.*))?$')

# Default port per scheme; a port equal to the default is dropped by norm().
_default_port = {
    'http': '80',
    'https': '443',
    'gopher': '70',
    'news': '119',
    'snews': '563',
    'nntp': '119',
    'snntp': '563',
    'ftp': '21',
    'telnet': '23',
    'prospero': '191',
}

# Schemes (including the empty scheme) whose paths get dot-segment and
# duplicate-slash collapsing in norm().
_relative_schemes = [
    'http',
    'https',
    'news',
    'snews',
    'nntp',
    'snntp',
    'ftp',
    'file',
    '',
]

# NOTE: not consulted by norm(); kept because it is a module-level name
# that other code may import.
_server_authority_schemes = [
    'http',
    'https',
    'news',
    'snews',
    'ftp',
]


def norms(urlstring):
    """Return the normalised form of the URL string *urlstring*.

    The string is split with urlparse(), normalised with norm() and
    reassembled with urlunparse().
    """
    return urlunparse(norm(urlparse(urlstring)))


def _normalise_authority(scheme, authority):
    """Return *authority* with host lowercased and redundant parts dropped.

    A trailing '.' on the host is removed, and the port is dropped when it
    is empty or equals the default port registered for *scheme*.  Userinfo
    is passed through unchanged.  An authority that cannot be split by
    _server_authority (for example ":" or ":80") is returned as-is rather
    than raising.
    """
    match = _server_authority.match(authority)
    if match is None:
        return authority
    userinfo, host, port = match.groups()
    if host.endswith('.'):
        host = host[:-1]
    normalised = host.lower()
    if userinfo:
        normalised = "%s@%s" % (userinfo, normalised)
    # An empty port ('' or None) is falsy and therefore omitted.
    if port and port != _default_port.get(scheme):
        normalised = "%s:%s" % (normalised, port)
    return normalised


def norm(urltuple):
    """Return the normalised form of a six-item URL tuple.

    *urltuple* is (scheme, authority, path, parameters, query, fragment),
    as produced by urlparse().  The result is a plain tuple of the same
    shape in which:

    * the scheme and the host are lowercased;
    * a trailing '.' on the host, an empty port and the scheme's default
      port are removed;
    * for schemes listed in _relative_schemes, "./", "../" and repeated
      slashes in the path are collapsed;
    * %-escapes in the path are unquoted (for every scheme).

    Parameters, query and fragment are returned untouched.
    """
    (scheme, authority, path, parameters, query, fragment) = urltuple
    scheme = scheme.lower()
    if authority:
        authority = _normalise_authority(scheme, authority)
    if scheme in _relative_schemes:
        # Apply one substitution at a time until the path stops changing,
        # so that collapsing one fragment can expose the next.
        while True:
            collapsed = _collapse.sub('/', path, 1)
            if collapsed == path:
                break
            path = collapsed
    path = unquote(path)
    return (scheme, authority, path, parameters, query, fragment)
def test():
    """Run the built-in test suite (some cases taken from RFC 1808).

    Each key of the table is passed to norms() and the result compared
    with the mapped value.  A report is printed for every case, followed
    by the totals of passing and failing cases.  Nothing is returned.
    """
    tests = {
        '/foo/bar/.': '/foo/bar/',
        '/foo/bar/./': '/foo/bar/',
        '/foo/bar/..': '/foo/',
        '/foo/bar/../': '/foo/',
        '/foo/bar/../baz': '/foo/baz',
        '/foo/bar/../..': '/',
        '/foo/bar/../../': '/',
        '/foo/bar/../../baz': '/baz',
        '/foo/bar/../../../baz': '/../baz',
        '/foo/bar/../../../../baz': '/baz',
        '/./foo': '/foo',
        '/../foo': '/../foo',
        '/foo.': '/foo.',
        '/.foo': '/.foo',
        '/foo..': '/foo..',
        '/..foo': '/..foo',
        '/./../foo': '/../foo',
        '/./foo/.': '/foo/',
        '/foo/./bar': '/foo/bar',
        '/foo/../bar': '/bar',
        '/foo//': '/foo/',
        '/foo///bar//': '/foo/bar/',
        'http://www.foo.com:80/foo': 'http://www.foo.com/foo',
        'http://www.foo.com:8000/foo': 'http://www.foo.com:8000/foo',
        'http://www.foo.com./foo/bar.html': 'http://www.foo.com/foo/bar.html',
        'http://www.foo.com.:81/foo': 'http://www.foo.com:81/foo',
        'http://www.foo.com/%7ebar': 'http://www.foo.com/~bar',
        'http://www.foo.com/%7Ebar': 'http://www.foo.com/~bar',
        # Userinfo must be preserved verbatim while the host is lowercased.
        'ftp://user:pass@ftp.foo.net/foo/bar':
            'ftp://user:pass@ftp.foo.net/foo/bar',
        'http://USER:pass@www.Example.COM/foo/bar':
            'http://USER:pass@www.example.com/foo/bar',
        'http://www.example.com./': 'http://www.example.com/',
        '-': '-',
    }
    n_correct, n_fail = 0, 0
    # Sorted so the report order is stable from run to run.
    for original in sorted(tests):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('ORIGINAL: %s' % original)
        cleaned = norms(original)
        expected = tests[original]
        print('CLEANED:  %s' % cleaned)
        print('CORRECT:  %s' % expected)
        if cleaned != expected:
            print("*** TEST FAILED")
            n_fail += 1
        else:
            n_correct += 1
    print("TOTAL CORRECT: %s" % n_correct)
    print("TOTAL FAILURE: %s" % n_fail)


if __name__ == '__main__':
    test()
Thank you Mark, I think your norm function should go into the urllib Python standard library.
A few remarks on your code though:
-
The `_server_authority_schemes` variable is not used.
-
The `_server_authority` regex does not allow empty port components with their ":" delimiters, while RFC 3986 does (and recommends normalizing them by removing the ":" delimiter):

    >>> norms('http://example.com:/')
    Traceback (most recent call last):
      [...]
        userinfo, host, port = _authority.match(authority).groups()
    AttributeError: 'NoneType' object has no attribute 'groups'

To correct this, the line:

    _server_authority = re.compile("^(?:([^\@]+)\@)?([^\:]+)(?:\:(.+))?$")

should be changed to this line (note the change of the last `+` quantifier to the `*` quantifier):

    _server_authority = re.compile("^(?:([^\@]+)\@)?([^\:]+)(?:\:(.*))?$")

Also you could add this line to the `test` function:

    'http://example.com:/': 'http://example.com/',
-
The `norm` function does not quote the path segment, so the `norms` function returns:

    >>> norms('http://example.com/ foo')
    'http://example.com/ foo'

instead of:

    >>> norms('http://example.com/ foo')
    'http://example.com/%20foo'

To correct this, the line in the `norm` function:

    path = urllib.parse.unquote(path)

should be followed by this line:

    path = urllib.parse.quote(path)

Also you could add these lines to the `test` function (the second line to check percent-encoding normalization of unreserved characters):

    'http://example.com/ foo': 'http://example.com/%20foo',
    'http://example.com/fo%6F': 'http://example.com/foo',
-
The `_collapse` regular expression incorrectly removes consecutive '/' delimiters in the path component, so the `norms` function returns:

    >>> norms('/foo////bar')
    '/foo/bar'

instead of:

    >>> norms('/foo////bar')
    '/foo////bar'

To correct this, a line calling the `remove_dot_segments` function specified in RFC 3986:

    path = remove_dot_segments(path)

should replace these lines:

    last_path = path
    while 1:
        path = _collapse.sub('/', path, 1)
        if last_path == path:
            break
        last_path = path

and this line should be removed:

    _collapse = re.compile('([^/]+/\.\./?|/\./|//|/\.$|/\.\.$)')
-
The `norm` function doesn't apply scheme-based normalization, as the `norms` function returns:

    >>> norms('http://example.com')
    'http://example.com'

instead of (note the trailing slash):

    >>> norms('http://example.com')
    'http://example.com/'

To correct this, the line in the `norm` function:

    path = unquote(path)

should be preceded by these lines:

    if authority and not path:
        path = "/"

Also you could add this line to the `test` function:

    'http://www.foo.com': 'http://www.foo.com/',
-
A Python 3 version of the code would be great.
I noticed another issue, but related to the urllib.parse module (and your norms function cannot do anything about it). The Python library documentation of the urllib.parse.urlunparse and urllib.parse.urlunsplit functions states:
This may result in a slightly different, but equivalent URL, if the URL that was parsed originally had unnecessary delimiters (for example, a ? with an empty query; the RFC states that these are equivalent).
So with the http://example.com/? URI:
>>> import urllib.parse
>>> urllib.parse.urlunparse(urllib.parse.urlparse("http://example.com/?"))
'http://example.com/'
>>> urllib.parse.urlunsplit(urllib.parse.urlsplit("http://example.com/?"))
'http://example.com/'
But RFC 3986 states the exact opposite:
Normalization should not remove delimiters when their associated component is empty unless licensed to do so by the scheme specification. For example, the URI "http://example.com/?" cannot be assumed to be equivalent to any of the examples above. Likewise, the presence or absence of delimiters within a userinfo subcomponent is usually significant to its interpretation. The fragment component is not subject to any scheme-based normalization; thus, two URIs that differ only by the suffix "#" are considered different regardless of the scheme.
Consequently, both urllib.parse.urlparse and urllib.parse.urlsplit lose the "delimiter + empty component" information of the URI string, so they wrongly report such URIs as equivalent:
>>> import urllib.parse
>>> urllib.parse.urlparse("http://example.com/?") == urllib.parse.urlparse("http://example.com/")
True
>>> urllib.parse.urlsplit("http://example.com/?") == urllib.parse.urlsplit("http://example.com/")
True
So you could add these lines to your tests (the last two left commented for the moment as urllib is broken):
'http://@example.com/': 'http://@example.com/',
# 'http://example.com/?': 'http://example.com/?',
# 'http://example.com/#': 'http://example.com/#',
For those interested, I have forked and improved this Gist (Python 3, RFC 3986 compliance, Unittest framework, all the above corrections) here:
https://gist.github.com/maggyero/9bc1382b74b0eaf67bb020669c01b234
cool~