```python
#!/usr/bin/env python

"""
urlnorm.py - URL normalisation routines

urlnorm normalises a URL by:
  * lowercasing the scheme and hostname
  * taking out default port if present (e.g., http://www.foo.com:80/)
  * collapsing the path (./, ../, etc)
  * removing the last character in the hostname if it is '.'
  * unquoting any %-escaped characters

Available functions:
  norms - given a URL (string), returns a normalised URL
  norm  - given a URL tuple, returns a normalised tuple
  test  - test suite

CHANGES:
0.92 - unknown schemes now pass the port through silently
0.91 - general cleanup
     - changed dictionaries to lists where appropriate
     - more fine-grained authority parsing and normalisation
"""

__license__ = """
Copyright (c) 1999-2002 Mark Nottingham <[email protected]>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

__version__ = "0.93"

from urlparse import urlparse, urlunparse
from urllib import unquote
from string import lower
import re

# matches one collapsible path construct: "segment/../", "/./", "//",
# or a trailing "/." or "/.."
_collapse = re.compile('([^/]+/\.\./?|/\./|//|/\.$|/\.\.$)')
# splits a server-based authority into its (userinfo, host, port) subcomponents
_server_authority = re.compile('^(?:([^\@]+)\@)?([^\:]+)(?:\:(.+))?$')

_default_port = {
    'http': '80',
    'https': '443',
    'gopher': '70',
    'news': '119',
    'snews': '563',
    'nntp': '119',
    'snntp': '563',
    'ftp': '21',
    'telnet': '23',
    'prospero': '191',
}

_relative_schemes = [
    'http',
    'https',
    'news',
    'snews',
    'nntp',
    'snntp',
    'ftp',
    'file',
    '',
]

_server_authority_schemes = [
    'http',
    'https',
    'news',
    'snews',
    'ftp',
]


def norms(urlstring):
    """given a string URL, return its normalised form"""
    return urlunparse(norm(urlparse(urlstring)))


def norm(urltuple):
    """given a six-tuple URL, return its normalised form"""
    (scheme, authority, path, parameters, query, fragment) = urltuple
    scheme = lower(scheme)
    if authority:
        userinfo, host, port = _server_authority.match(authority).groups()
        if host[-1] == '.':
            host = host[:-1]
        authority = lower(host)
        if userinfo:
            authority = "%s@%s" % (userinfo, authority)
        if port and port != _default_port.get(scheme, None):
            authority = "%s:%s" % (authority, port)
    if scheme in _relative_schemes:
        last_path = path
        while 1:
            path = _collapse.sub('/', path, 1)
            if last_path == path:
                break
            last_path = path
    path = unquote(path)
    return (scheme, authority, path, parameters, query, fragment)


def test():
    """ test suite; some taken from RFC1808. """
    tests = {
        '/foo/bar/.': '/foo/bar/',
        '/foo/bar/./': '/foo/bar/',
        '/foo/bar/..': '/foo/',
        '/foo/bar/../': '/foo/',
        '/foo/bar/../baz': '/foo/baz',
        '/foo/bar/../..': '/',
        '/foo/bar/../../': '/',
        '/foo/bar/../../baz': '/baz',
        '/foo/bar/../../../baz': '/../baz',
        '/foo/bar/../../../../baz': '/baz',
        '/./foo': '/foo',
        '/../foo': '/../foo',
        '/foo.': '/foo.',
        '/.foo': '/.foo',
        '/foo..': '/foo..',
        '/..foo': '/..foo',
        '/./../foo': '/../foo',
        '/./foo/.': '/foo/',
        '/foo/./bar': '/foo/bar',
        '/foo/../bar': '/bar',
        '/foo//': '/foo/',
        '/foo///bar//': '/foo/bar/',
        'http://www.foo.com:80/foo': 'http://www.foo.com/foo',
        'http://www.foo.com:8000/foo': 'http://www.foo.com:8000/foo',
        'http://www.foo.com./foo/bar.html': 'http://www.foo.com/foo/bar.html',
        'http://www.foo.com.:81/foo': 'http://www.foo.com:81/foo',
        'http://www.foo.com/%7ebar': 'http://www.foo.com/~bar',
        'http://www.foo.com/%7Ebar': 'http://www.foo.com/~bar',
        'ftp://user:[email protected]/foo/bar': 'ftp://user:[email protected]/foo/bar',
        'http://USER:[email protected]/foo/bar': 'http://USER:[email protected]/foo/bar',
        'http://www.example.com./': 'http://www.example.com/',
        '-': '-',
    }
    n_correct, n_fail = 0, 0
    test_keys = tests.keys()
    test_keys.sort()
    for i in test_keys:
        print 'ORIGINAL:', i
        cleaned = norms(i)
        answer = tests[i]
        print 'CLEANED: ', cleaned
        print 'CORRECT: ', answer
        if cleaned != answer:
            print "*** TEST FAILED"
            n_fail = n_fail + 1
        else:
            n_correct = n_correct + 1
    print "TOTAL CORRECT:", n_correct
    print "TOTAL FAILURE:", n_fail


if __name__ == '__main__':
    test()
```
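For a quick feel of what the module does, an interactive session (assuming the file is saved as `urlnorm.py`, and run under Python 2 since it imports from `urlparse` and `string`) might look like this:

```
>>> from urlnorm import norms
>>> norms('HTTP://www.Example.COM:80/foo/./bar/../baz')
'http://www.example.com/foo/baz'
>>> norms('http://www.foo.com/%7ebar')
'http://www.foo.com/~bar'
```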
Thank you Mark, I think your `norm` function should go into the `urllib` Python standard library.

A few remarks on your code though:
- The `_server_authority_schemes` variable is not used.

- The `_server_authority` regex does not allow an empty port component after the `:` delimiter, while RFC 3986 does (and recommends normalizing such URIs by removing the `:` delimiter):

  ```
  >>> norms('http://example.com:/')
  Traceback (most recent call last):
  [...]
    userinfo, host, port = _server_authority.match(authority).groups()
  AttributeError: 'NoneType' object has no attribute 'groups'
  ```

  To correct this, the line:

  ```python
  _server_authority = re.compile('^(?:([^\@]+)\@)?([^\:]+)(?:\:(.+))?$')
  ```

  should be changed to this line (note the change of the last `+` quantifier to the `*` quantifier):

  ```python
  _server_authority = re.compile('^(?:([^\@]+)\@)?([^\:]+)(?:\:(.*))?$')
  ```

  Also you could add this line to the `test` function:

  ```python
  'http://example.com:/': 'http://example.com/',
  ```
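  With the `*` quantifier, an empty port is captured as `''`, which is falsy, so the existing `if port and ...` test then drops the delimiter. A quick interactive check, assuming the corrected pattern:

  ```
  >>> import re
  >>> _server_authority = re.compile('^(?:([^\@]+)\@)?([^\:]+)(?:\:(.*))?$')
  >>> _server_authority.match('example.com:').groups()
  (None, 'example.com', '')
  ```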
- The `norm` function does not quote the path component, so the `norms` function returns:

  ```
  >>> norms('http://example.com/ foo')
  'http://example.com/ foo'
  ```

  instead of:

  ```
  >>> norms('http://example.com/ foo')
  'http://example.com/%20foo'
  ```

  To correct this, the line in the `norm` function:

  ```python
  path = unquote(path)
  ```

  should be followed by this line (with `quote` also imported from `urllib`):

  ```python
  path = quote(path)
  ```

  Also you could add these lines to the `test` function (the second one to check percent-encoding normalization of unreserved characters):

  ```python
  'http://example.com/ foo': 'http://example.com/%20foo',
  'http://example.com/fo%6F': 'http://example.com/foo',
  ```
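  One caveat with unquoting and then re-quoting the whole path: it also decodes percent-encoded *reserved* characters such as `%2F`, which RFC 3986 says should be left alone (only unreserved characters may be safely normalized). A quick illustration:

  ```
  >>> from urllib import quote, unquote
  >>> quote(unquote('/a%2Fb'))
  '/a/b'
  ```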
- The `_collapse` regular expression incorrectly removes consecutive `/` delimiters in the path component, so the `norms` function returns:

  ```
  >>> norms('/foo////bar')
  '/foo/bar'
  ```

  instead of:

  ```
  >>> norms('/foo////bar')
  '/foo////bar'
  ```

  To correct this, a line calling the `remove_dot_segments` function specified in RFC 3986 (a sketch of that function follows below):

  ```python
  path = remove_dot_segments(path)
  ```

  should replace these lines:

  ```python
  last_path = path
  while 1:
      path = _collapse.sub('/', path, 1)
      if last_path == path:
          break
      last_path = path
  ```

  and this line should be removed:

  ```python
  _collapse = re.compile('([^/]+/\.\./?|/\./|//|/\.$|/\.\.$)')
  ```
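  RFC 3986 specifies the algorithm in prose only, so here is a direct transcription of the five rules of section 5.2.4 (a sketch; note that, unlike `_collapse`, it also strips leading `..` segments, as the RFC requires, so the `'/../foo'` test expectation above would change):

  ```python
  def remove_dot_segments(path):
      """Remove '.' and '..' segments from path (RFC 3986, section 5.2.4)."""
      output = []
      while path:
          if path.startswith('../'):        # rule A: leading '../'
              path = path[3:]
          elif path.startswith('./'):       # rule A: leading './'
              path = path[2:]
          elif path.startswith('/./'):      # rule B: '/./' -> '/'
              path = '/' + path[3:]
          elif path == '/.':                # rule B: trailing '/.'
              path = '/'
          elif path.startswith('/../'):     # rule C: '/../' -> '/', drop last segment
              path = '/' + path[4:]
              if output:
                  output.pop()
          elif path == '/..':               # rule C: trailing '/..'
              path = '/'
              if output:
                  output.pop()
          elif path in ('.', '..'):         # rule D: lone '.' or '..'
              path = ''
          else:                             # rule E: move first segment to the output
              slash = path.find('/', 1)
              if slash == -1:
                  slash = len(path)
              output.append(path[:slash])
              path = path[slash:]
      return ''.join(output)
  ```

  This keeps empty segments intact (`remove_dot_segments('/foo////bar')` returns `'/foo////bar'`) while still collapsing dot-segments (`remove_dot_segments('/foo/./bar/../baz')` returns `'/foo/baz'`).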
- The `norm` function doesn't apply scheme-based normalization, so the `norms` function returns:

  ```
  >>> norms('http://example.com')
  'http://example.com'
  ```

  instead of (note the trailing slash):

  ```
  >>> norms('http://example.com')
  'http://example.com/'
  ```

  To correct this, the line in the `norm` function:

  ```python
  path = unquote(path)
  ```

  should be preceded by these lines:

  ```python
  if authority and not path:
      path = "/"
  ```

  Also you could add this line to the `test` function:

  ```python
  'http://www.foo.com': 'http://www.foo.com/',
  ```
- A Python 3 version of the code would be great.
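  For what it's worth, a minimal sketch of a Python 3 rendering of the two core functions (with abridged `_default_port` and `_relative_schemes` tables here, and no behavioural changes intended) could be:

  ```python
  from urllib.parse import urlparse, urlunparse, unquote  # replaces urlparse/urllib
  import re

  _collapse = re.compile(r'([^/]+/\.\./?|/\./|//|/\.$|/\.\.$)')
  _server_authority = re.compile(r'^(?:([^\@]+)\@)?([^\:]+)(?:\:(.+))?$')
  _default_port = {'http': '80', 'https': '443', 'ftp': '21'}   # abridged
  _relative_schemes = ['http', 'https', 'ftp', 'file', '']      # abridged

  def norm(urltuple):
      """given a six-tuple URL, return its normalised form"""
      scheme, authority, path, parameters, query, fragment = urltuple
      scheme = scheme.lower()                # str.lower() replaces string.lower
      if authority:
          userinfo, host, port = _server_authority.match(authority).groups()
          if host[-1] == '.':
              host = host[:-1]
          authority = host.lower()
          if userinfo:
              authority = "%s@%s" % (userinfo, authority)
          if port and port != _default_port.get(scheme):
              authority = "%s:%s" % (authority, port)
      if scheme in _relative_schemes:
          last_path = path
          while True:
              path = _collapse.sub('/', path, 1)
              if last_path == path:
                  break
              last_path = path
      path = unquote(path)
      return (scheme, authority, path, parameters, query, fragment)

  def norms(urlstring):
      """given a string URL, return its normalised form"""
      return urlunparse(norm(urlparse(urlstring)))
  ```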
I noticed another issue, though it is related to the `urllib.parse` module itself (your `norms` function cannot do anything about it). The Python library documentation of the `urllib.parse.urlunparse` and `urllib.parse.urlunsplit` functions states:

> This may result in a slightly different, but equivalent URL, if the URL that was parsed originally had unnecessary delimiters (for example, a ? with an empty query; the RFC states that these are equivalent).

So with the `http://example.com/?` URI:

```
>>> import urllib.parse
>>> urllib.parse.urlunparse(urllib.parse.urlparse("http://example.com/?"))
'http://example.com/'
>>> urllib.parse.urlunsplit(urllib.parse.urlsplit("http://example.com/?"))
'http://example.com/'
```

But RFC 3986 states the exact opposite:

> Normalization should not remove delimiters when their associated component is empty unless licensed to do so by the scheme specification. For example, the URI "http://example.com/?" cannot be assumed to be equivalent to any of the examples above. Likewise, the presence or absence of delimiters within a userinfo subcomponent is usually significant to its interpretation. The fragment component is not subject to any scheme-based normalization; thus, two URIs that differ only by the suffix "#" are considered different regardless of the scheme.
Consequently, both `urllib.parse.urlparse` and `urllib.parse.urlsplit` lose the "delimiter + empty component" information of the URI string, so they report false equivalences:

```
>>> import urllib.parse
>>> urllib.parse.urlparse("http://example.com/?") == urllib.parse.urlparse("http://example.com/")
True
>>> urllib.parse.urlsplit("http://example.com/?") == urllib.parse.urlsplit("http://example.com/")
True
```
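A simple way to detect this loss (sketched here; the helper name is an invention for illustration) is to check whether `urlsplit`/`urlunsplit` reproduce the original string exactly:

```python
import urllib.parse

def roundtrips(uri):
    """Return True if urlsplit/urlunsplit reproduce `uri` exactly, i.e.
    no delimiter with an empty component ('?' or '#') was dropped."""
    return urllib.parse.urlunsplit(urllib.parse.urlsplit(uri)) == uri

assert roundtrips("http://example.com/")
assert not roundtrips("http://example.com/?")  # '?' with empty query is lost
assert not roundtrips("http://example.com/#")  # '#' with empty fragment is lost
```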
So you could add these lines to your tests (the last two left commented out for the moment, as `urllib` is broken there):

```python
'http://@example.com/': 'http://@example.com/',
# 'http://example.com/?': 'http://example.com/?',
# 'http://example.com/#': 'http://example.com/#',
```
For those interested, I have forked and improved this Gist (Python 3, RFC 3986 compliance, unittest framework, all of the above corrections) here:
https://gist.github.com/maggyero/9bc1382b74b0eaf67bb020669c01b234
cool~