Created
April 22, 2011 14:17
-
-
Save originell/936744 to your computer and use it in GitHub Desktop.
Trying to build a regex validating the given URIs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# encoding: utf-8 | |
""" | |
Build URL Validation Regex. | |
See the corresponding *_urls lists for a definition of what we
want to match and what not.
Note that according to RFC 2616 (HTTP/1.1, Section 3.2.1) there is
no such thing as a maximum URI length, since servers should
be able to handle URIs of unbounded length.
However in practice (see http://www.boutell.com/newfaq/misc/urllength.html) | |
it seems that a good limit is 2,000 characters. | |
The DNS Protocol has the following limits, which affect | |
the domain-level, not on the complete URI (remember that GET parameters | |
and hashbangs are part of the URI): | |
* Maximum level of subdivisions (read: subdomains) is 127
* Each label may consist of up to 63 characters | |
* The full domain name may not exceed 253 characters
in its external dotted-label representation.
* DNS names adhere to a subset of the ASCII charset.
Umlauts and other linguistically special characters get resolved
by punycode. Therefore the LDH rule applies (letters, digits, hyphen),
meaning only a-z, A-Z, 0-9 and the hyphen itself are legal characters.
Should we ever need to use punycode, I recommend implementing this
via a webservice API since Python has a punycode implementation integrated.
Furthermore we do not need to recognize 100% specific locations. | |
So dealing with filenames or ports should not be necessary.
By assuring this, we can also avoid pointers to abusive images. | |
Attention: | |
This regular expression is built to be compatible with JavaScript 1.2 (ECMA-262).
To see the differences visit http://www.regular-expressions.info/javascript.html | |
and http://www.regular-expressions.info/python.html | |
""" | |
import re | |
# This is django's regex. Here for reference. | |
#regex = re.compile( | |
# r'^(?:http|ftp)s?://' # http:// or https:// | |
# r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain... | |
# r'localhost|' #localhost... | |
# r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip | |
# r'(?::\d+)?' # optional port | |
# r'(?:/?|[/?]\S+)$', re.IGNORECASE) | |
# | |
# Second try | |
# regex = re.compile( | |
# r'^(' | |
# r'((http)s?://)?' # match http:// or https:// - OPTIONAL | |
# r'(www\.)?' # match www. - OPTIONAL | |
# r')?' # the http/https/www part is completely... OPTIONAL | |
# r'[A-Z0-9][-A-Z0-9]{0,61}[A-Z0-9]\.' # domainname not allowed to start | |
# # with a hyphen or end with it. | |
# # max length: 63 chars | |
# r'[A-Z]{2,6}/?' # tld (longest I found was .museum) | |
# r'[A-Z0-9][-A-Z0-9]{0,61}[A-Z0-9]\.' | |
# r'$' | |
# , re.IGNORECASE) #|re.DEBUG | |
# Third attempt: an optional scheme / "www." prefix in front of the domain
# expression taken from Django's URL validator (see the commented reference
# above), without its localhost, IP and port branches.
regex = re.compile(
    ''.join((
        # Optional prefix: "http://" or "https://", then an optional "www.".
        # The enclosing group is itself optional.
        r'^(((http)s?://)?(www\.)?)?',
        # One or more dot-terminated labels of 1-63 characters each; a label
        # starts and ends with a letter or digit and may contain hyphens.
        r'(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+',
        # Final label: 2-6 letters, or 2+ letters/digits/hyphens, each with
        # an optional trailing dot.
        r'(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)',
        # Rest of the string: nothing, a lone "/", or "/" or "?" followed by
        # non-whitespace characters up to the end.
        r'(?:/?|[/?]\S+)$',
    )),
    flags=re.IGNORECASE,  # add re.DEBUG here to dump the parsed pattern
)
def _build_match_urls():
    """Return the URLs the pattern is expected to accept.

    The fixtures are grouped by path depth (no path, one, two and three
    segments).  Within each group every scheme/host combination is listed
    first with a trailing slash, then without one, followed by the remaining
    hosts (mostly scheme-less) in slash / no-slash pairs.
    """
    schemes = ('http://', 'https://')
    hosts = (
        'www.example.com',
        'subdomain.example.com',
        'example.com',
        'www.exa-mple.com',
    )
    # Emitted as given, i.e. without iterating over the schemes.
    extra_prefixes = (
        'www.example.com',
        'subdomain.example.com',
        'subdomain.sub2.example.com',
        'http://sub-domain.example.com',
        'sub-domain.example.com',
    )
    paths = (
        '',
        '/path',
        '/path/secondlevel',
        '/path/secondlevel/thirdlevel',
    )

    urls = []
    for path in paths:
        for tail in ('/', ''):
            for host in hosts:
                for scheme in schemes:
                    urls.append(scheme + host + path + tail)
        for prefix in extra_prefixes:
            for tail in ('/', ''):
                urls.append(prefix + path + tail)
    return urls


# These are the URLs we want to match
match_urls = _build_match_urls()
# These are the URLs the pattern must reject.  Every entry ends with a comma:
# adjacent string literals without one are silently concatenated into a
# single (bogus) fixture.
dont_match_urls = [
    # Non-HTTP scheme with userinfo ("user" is only a placeholder).
    'ftp://user@example.com/',
    'ftp://user@example.com',
    'ftp://user@example.com/path/',
    'ftp://user@example.com/path',
    # Bare IP addresses, with and without a trailing slash.
    'http://127.0.0.1/',
    'https://127.0.0.1/',
    'http://127.0.0.1',
    'https://127.0.0.1',
    # Explicit ports.
    'http://example.com:80/',
    'https://example.com:80/',
    'http://example.com:80',
    'https://example.com:80',
    'http://example.com:80/path/',
    'https://example.com:80/path/',
    'http://example.com:80/path',
    'https://example.com:80/path',
    'www.example.com:80/',
    'www.example.com:80',
    'example.com:80/',
    'example.com:80',
    # Incomplete fragments.
    'http://',
    'https://',
    'www.',
    'example',
    '.com',
    'com',
    '//',
    # Scheme-relative IP and localhost.
    '//192.168.1.1/',
    'http://localhost/',
    # Labels starting or ending with a hyphen.
    'http://-example.com/',
    'http://example-.com/',
    # File names in the path.
    'http://example.com/index.php',
    'http://example.com/path/index.php',
    'http://example.com/foobar.jpg',
    'http://example.com/path/foobar.jpg',
    # Non-numeric port.
    'http://example.com:foobar/',
]
# Run every fixture through the pattern and collect the ones that behave
# unexpectedly.  The single-argument print(...) form with %-formatting gives
# the same output under Python 2 and Python 3.
false_matches = []

# URLs that should have matched but did not (the result is always None here).
for match_url in match_urls:
    does_match = regex.match(match_url)
    if does_match is None:
        print("%s %s" % (match_url, does_match))
        false_matches.append(match_url)

# URLs that should have been rejected but matched; the match object is
# printed alongside the URL.
for dont_match_url in dont_match_urls:
    does_not_match = regex.match(dont_match_url)
    if does_not_match is not None:
        print("%s %s" % (dont_match_url, does_not_match))
        false_matches.append(dont_match_url)

# Summary line, printed only when at least one fixture misbehaved.
if false_matches:
    print("==========================================")
    print("%i/%i urls failed" % (len(false_matches),
                               len(match_urls) + len(dont_match_urls)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment