Created December 1, 2009 05:29
urlnorm.py: URL normalisation
#!/usr/bin/env python

"""
urlnorm.py - URL normalisation routines

urlnorm normalises a URL by;
    * lowercasing the scheme and hostname
    * taking out default port if present (e.g., http://www.foo.com:80/)
    * collapsing the path (./, ../, etc)
    * removing the last character in the hostname if it is '.'
    * unquoting any %-escaped characters

Available functions:
    norms - given a URL (string), returns a normalised URL
    norm - given a URL tuple, returns a normalised tuple
    test - test suite

CHANGES:
0.92 - unknown schemes now pass the port through silently
0.91 - general cleanup
     - changed dictionaries to lists where appropriate
     - more fine-grained authority parsing and normalisation
"""

__license__ = """
Copyright (c) 1999-2002 Mark Nottingham <[email protected]>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

__version__ = "0.93"

from urlparse import urlparse, urlunparse
from urllib import unquote
from string import lower
import re

_collapse = re.compile('([^/]+/\.\./?|/\./|//|/\.$|/\.\.$)')
_server_authority = re.compile('^(?:([^\@]+)\@)?([^\:]+)(?:\:(.+))?$')
_default_port = {'http': '80',
                 'https': '443',
                 'gopher': '70',
                 'news': '119',
                 'snews': '563',
                 'nntp': '119',
                 'snntp': '563',
                 'ftp': '21',
                 'telnet': '23',
                 'prospero': '191',
                 }
_relative_schemes = ['http',
                     'https',
                     'news',
                     'snews',
                     'nntp',
                     'snntp',
                     'ftp',
                     'file',
                     ''
                     ]
_server_authority_schemes = ['http',
                             'https',
                             'news',
                             'snews',
                             'ftp',
                             ]


def norms(urlstring):
    """given a string URL, return its normalised form"""
    return urlunparse(norm(urlparse(urlstring)))


def norm(urltuple):
    """given a six-tuple URL, return its normalised form"""
    (scheme, authority, path, parameters, query, fragment) = urltuple
    scheme = lower(scheme)
    if authority:
        userinfo, host, port = _server_authority.match(authority).groups()
        if host[-1] == '.':
            host = host[:-1]
        authority = lower(host)
        if userinfo:
            authority = "%s@%s" % (userinfo, authority)
        if port and port != _default_port.get(scheme, None):
            authority = "%s:%s" % (authority, port)
    if scheme in _relative_schemes:
        last_path = path
        while 1:
            path = _collapse.sub('/', path, 1)
            if last_path == path:
                break
            last_path = path
    path = unquote(path)
    return (scheme, authority, path, parameters, query, fragment)


def test():
    """ test suite; some taken from RFC1808. """
    tests = {
        '/foo/bar/.': '/foo/bar/',
        '/foo/bar/./': '/foo/bar/',
        '/foo/bar/..': '/foo/',
        '/foo/bar/../': '/foo/',
        '/foo/bar/../baz': '/foo/baz',
        '/foo/bar/../..': '/',
        '/foo/bar/../../': '/',
        '/foo/bar/../../baz': '/baz',
        '/foo/bar/../../../baz': '/../baz',
        '/foo/bar/../../../../baz': '/baz',
        '/./foo': '/foo',
        '/../foo': '/../foo',
        '/foo.': '/foo.',
        '/.foo': '/.foo',
        '/foo..': '/foo..',
        '/..foo': '/..foo',
        '/./../foo': '/../foo',
        '/./foo/.': '/foo/',
        '/foo/./bar': '/foo/bar',
        '/foo/../bar': '/bar',
        '/foo//': '/foo/',
        '/foo///bar//': '/foo/bar/',
        'http://www.foo.com:80/foo': 'http://www.foo.com/foo',
        'http://www.foo.com:8000/foo': 'http://www.foo.com:8000/foo',
        'http://www.foo.com./foo/bar.html': 'http://www.foo.com/foo/bar.html',
        'http://www.foo.com.:81/foo': 'http://www.foo.com:81/foo',
        'http://www.foo.com/%7ebar': 'http://www.foo.com/~bar',
        'http://www.foo.com/%7Ebar': 'http://www.foo.com/~bar',
        'ftp://user:[email protected]/foo/bar': 'ftp://user:[email protected]/foo/bar',
        'http://USER:[email protected]/foo/bar': 'http://USER:[email protected]/foo/bar',
        'http://www.example.com./': 'http://www.example.com/',
        '-': '-',
    }
    n_correct, n_fail = 0, 0
    test_keys = tests.keys()
    test_keys.sort()
    for i in test_keys:
        print 'ORIGINAL:', i
        cleaned = norms(i)
        answer = tests[i]
        print 'CLEANED: ', cleaned
        print 'CORRECT: ', answer
        if cleaned != answer:
            print "*** TEST FAILED"
            n_fail = n_fail + 1
        else:
            n_correct = n_correct + 1
    print "TOTAL CORRECT:", n_correct
    print "TOTAL FAILURE:", n_fail


if __name__ == '__main__':
    test()
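For reference, a minimal usage sketch, assuming the file is saved as urlnorm.py on the import path and run under Python 2 (which the urlparse/urllib imports above require):

    >>> from urlnorm import norms
    >>> norms('HTTP://www.FOO.com:80/foo/./bar/../baz')
    'http://www.foo.com/foo/baz'
    >>> norms('http://www.foo.com./%7ebar')
    'http://www.foo.com/~bar'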
For those interested, I have forked and improved this Gist (Python 3, RFC 3986 compliance, Unittest framework, all the above corrections) here:
https://gist.github.com/maggyero/9bc1382b74b0eaf67bb020669c01b234
Thank you Mark, I think your norm function should go into the urllib Python standard library. A few remarks on your code though:
The _server_authority_schemes variable is not used.

The _server_authority regex does not allow an empty port component with its ':' delimiter, while RFC 3986 does allow it (and recommends normalizing it away by removing the ':' delimiter). To correct this, the last '+' quantifier in the regex should be changed to a '*' quantifier, and a test case with an empty port could be added to the test function (see the sketch below).
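For example (the regex change follows directly from the description above; the test value is only illustrative):

    # current pattern: the trailing '+' means an empty port ('http://www.foo.com:/foo') does not match
    _server_authority = re.compile('^(?:([^\@]+)\@)?([^\:]+)(?:\:(.+))?$')

    # corrected pattern: '*' instead of '+', so an empty port matches and is then
    # dropped because '' is falsy in the "if port and ..." check
    _server_authority = re.compile('^(?:([^\@]+)\@)?([^\:]+)(?:\:(.*))?$')

    # possible addition to the tests dictionary in test():
    'http://www.foo.com:/foo': 'http://www.foo.com/foo',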
The norm function unquotes the path but never re-quotes it, so the norms function can return an invalid URI: for example, http://www.foo.com/foo%20bar comes back as 'http://www.foo.com/foo bar' (with a literal space) instead of http://www.foo.com/foo%20bar. To correct this, the line in the norm function that calls unquote(path) should be followed by a line that re-quotes the path. You could also add two lines to the test function, the second one checking percent-encoding normalization of unreserved characters (see the sketch below).
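A sketch of that change (the exact safe-character set passed to quote, and the test values, are judgement calls rather than the definitive fix):

    from urllib import quote, unquote   # the import line gains quote

        # inside norm(), after the existing line:
        path = unquote(path)
        # re-quote the path so that characters which must stay escaped are
        # re-encoded; '~' is listed as safe so the existing %7E tests still pass
        path = quote(path, "/~")

    # possible additions to the tests dictionary in test():
    'http://www.foo.com/foo%20bar': 'http://www.foo.com/foo%20bar',
    'http://www.foo.com/b%61r': 'http://www.foo.com/bar',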
The _collapse regular expression incorrectly removes consecutive '/' delimiters in the path component, so the norms function returns, for example, http://www.foo.com/foo/bar for http://www.foo.com/foo//bar instead of preserving the empty segment and returning http://www.foo.com/foo//bar. To correct this, the collapse loop in norm should be replaced by a call to the remove_dot_segments function specified in RFC 3986, and the _collapse regular expression should be removed (see the sketch below).
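A sketch of that change, following the algorithm in RFC 3986, section 5.2.4 (this is one possible rendering of it, not necessarily the exact code originally proposed):

    def remove_dot_segments(path):
        # remove '.' and '..' segments per RFC 3986, section 5.2.4, without
        # collapsing empty ('//') segments
        output = []
        while path:
            if path.startswith('../'):
                path = path[3:]
            elif path.startswith('./'):
                path = path[2:]
            elif path.startswith('/./'):
                path = '/' + path[3:]
            elif path == '/.':
                path = '/'
            elif path.startswith('/../'):
                path = '/' + path[4:]
                if output:
                    output.pop()
            elif path == '/..':
                path = '/'
                if output:
                    output.pop()
            elif path in ('.', '..'):
                path = ''
            else:
                # move the first segment (with its leading '/', if any) to the output
                slash = path.find('/', 1)
                if slash == -1:
                    slash = len(path)
                output.append(path[:slash])
                path = path[slash:]
        return ''.join(output)

    # in norm(), the collapse loop then becomes simply:
    if scheme in _relative_schemes:
        path = remove_dot_segments(path)

Note that the existing '/foo//' and '/foo///bar//' test expectations would need updating, since RFC 3986 preserves empty path segments.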
The norm function doesn't apply scheme-based normalization: for a URI such as http://www.foo.com, the norms function returns http://www.foo.com instead of http://www.foo.com/ (note the trailing slash). To correct this, the path handling in the norm function should be prepended with a couple of lines that turn an empty path into '/', and a matching test case could be added to the test function (see below).
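A sketch of what those lines might look like (the condition on authority is one reading of RFC 3986, section 6.2.3; the test value is only illustrative):

    # scheme-based normalisation: for a URI with an authority component, an
    # empty path is equivalent to '/'
    if authority and not path:
        path = '/'
    path = unquote(path)   # the existing line, now prepended by the check above

    # possible addition to the tests dictionary in test():
    'http://www.foo.com': 'http://www.foo.com/',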
A Python 3 version of the code would be great.
I noticed another issue, but related to the urllib.parse module (and your norms function cannot do anything about it). The Python library documentation of the urllib.parse.urlunparse and urllib.parse.urlunsplit functions states that the result may be a slightly different but equivalent URL if the URL that was parsed originally had unnecessary delimiters, for example a '?' with an empty query, which it claims the RFC treats as equivalent. So with the http://example.com/? URI, a parse/unparse round trip silently drops the '?', as shown below.
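For example, under Python 3 (urlparse/urlunparse behave the same way as urlsplit/urlunsplit here):

    >>> from urllib.parse import urlsplit, urlunsplit
    >>> urlsplit('http://example.com/?')
    SplitResult(scheme='http', netloc='example.com', path='/', query='', fragment='')
    >>> urlunsplit(urlsplit('http://example.com/?'))
    'http://example.com/'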
But RFC 3986 (section 6.2.3) states the exact opposite: normalization should not remove delimiters when their associated component is empty.
Consequently, both urllib.parse.urlparse and urllib.parse.urlsplit lose the "delimiter + empty component" information of the URI string, so they report false equivalent URIs (http://example.com/? and http://example.com/ come out identical). So you could add a few lines like the following to your tests, with the last two left commented out for the moment as urllib is broken.
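For example (the entries are only illustrative):

    # possible additions to the tests dictionary in test(); the last two are
    # commented out for now because urlparse/urlunparse drop the empty '?'
    # and '#' delimiters
    'http://www.foo.com/?q=bar': 'http://www.foo.com/?q=bar',
    # 'http://www.foo.com/?': 'http://www.foo.com/?',
    # 'http://www.foo.com/#': 'http://www.foo.com/#',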