Skip to content

Instantly share code, notes, and snippets.

@datamafia
Created March 24, 2015 01:03
Show Gist options
  • Save datamafia/e2fb763fa28fd6dc7864 to your computer and use it in GitHub Desktop.
Save datamafia/e2fb763fa28fd6dc7864 to your computer and use it in GitHub Desktop.
Highly compliant Python URL validator
"""Check the validity of URL for maximum compliance, return "ready to
use" url address (with http:// as needed) on success
Abstract: Be dry. URL verification is a persistent need, this iteration
has been very handy for website use, such as profile creation and seems
to have a near total compliance with the changing URL schemas.
No code is perfect; please let me know if you find an error or have a proposed
patch.
Credit and resources:
https://mathiasbynens.be/demo/url-regex
https://gist.github.com/dperini/729294
https://gist.github.com/dperini/729294#comment-1296121
"""
import re

try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse  # Python 2
# Character ranges shared by the host, domain and TLD parts of the pattern.
# They are plain (non-raw) unicode literals so the \uXXXX escapes are real
# characters on every Python version, instead of relying on the regex engine
# to interpret a backslash-u sequence.
_LABEL_CHARS = u"a-z0-9\u00a1-\uffff"
_TLD_CHARS = u"a-z\u00a1-\uffff"

# Compiled once at import time rather than on every check_url() call.
_URL_REGEX = re.compile(
    u"".join((
        r"^",
        # protocol identifier
        r"(?:(?:https?)://)",
        # user:pass authentication
        r"(?:\S+(?::\S*)?@)?",
        r"(?:",
        # IP address exclusion
        # private & local networks
        r"(?!(?:10|127)(?:\.\d{1,3}){3})",
        r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})",
        r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})",
        # IP address dotted notation octets
        # excludes loopback network 0.0.0.0
        # excludes reserved space >= 224.0.0.0
        # excludes network & broadcast addresses
        # (first & last IP address of each class)
        r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])",
        r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}",
        r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))",
        r"|",
        # host name
        r"(?:(?:[" + _LABEL_CHARS + r"]-?)*[" + _LABEL_CHARS + r"]+)",
        # domain name
        r"(?:\.(?:[" + _LABEL_CHARS + r"]-?)*[" + _LABEL_CHARS + r"]+)*",
        # TLD identifier
        r"(?:\.(?:[" + _TLD_CHARS + r"]{2,}))",
        r")",
        # port number
        r"(?::\d{2,5})?",
        # resource path
        r"(?:/\S*)?",
        r"$",
    )),
    # Host names are case-insensitive, so 'Example.COM' must validate too.
    re.UNICODE | re.IGNORECASE,
)


def check_url(url):
    """Check for a valid http(s) URL, prepending http:// when no scheme is given.

    Args:
        url: string, required.

    Returns:
        tuple (success, url):
        * on success, ``url`` is the rebuilt address: scheme, host, path
          with a trailing slash, then the query and fragment if present.
        * on failure, ``url`` is the input with all whitespace removed.

    Respects:
        fragments ...com/param#stuf
        query ...com/param/?foo=bar&bar=foo

    Notes:
        * Only http and https pass validation; other schemes (FTP, etc.)
          are rejected by the pattern.
        * Input that urlparse itself refuses (for example an unbalanced
          '[' in the host) is reported as invalid rather than raising.
        * The ``;params`` component reported by urlparse is not carried
          into the rebuilt URL.
    """
    # Remove every whitespace character (spaces, tabs, newlines), not just ' '.
    url = "".join(url.split())
    if not url:
        return (False, url)

    try:
        parts = urlparse(url)
    except ValueError:
        # A validator should answer "invalid", not crash, on malformed input.
        return (False, url)

    # Nothing that could be a host: e.g. a bare '?query' or '#fragment'.
    if not parts.netloc and not parts.path:
        return (False, url)

    # Without a scheme, urlparse leaves netloc empty and puts the host in
    # the path, so the slash appended here terminates the host in that case.
    # NOTE(review): the slash is appended to every path, including file-like
    # ones such as /index.html -- confirm this is intended.
    path = parts.path
    if not path.endswith('/'):
        path += '/'

    # Default to http; the server can redirect to https as needed.
    scheme = parts.scheme or 'http'
    new_url = scheme + '://' + parts.netloc + path

    if parts.query:
        new_url += '?' + parts.query
    if parts.fragment:
        new_url += '#' + parts.fragment

    if not _URL_REGEX.match(new_url):
        return (False, url)
    return (True, new_url)
""" Tests for url_validation.py script"""
from __future__ import print_function
from url_validation import check_url
# Table of test cases for check_url().
# Each entry is (input, URL expected back from check_url, expected success flag).
# For failing inputs the expected URL is the input with its spaces removed.
urls = [
    ('www.site2.com', 'http://www.site2.com/', True),
    ('www.site3.com/#', 'http://www.site3.com/', True),
    ('www.site4.com#', 'http://www.site4.com/', True),
    ('www.site5.com/#one', 'http://www.site5.com/#one', True),
    ('www.site6.com/#one#two', 'http://www.site6.com/#one#two', True),
    ('site.com', 'http://site.com/', True),
    ('http://place.com', 'http://place.com/', True),
    ('sub.domain.com', 'http://sub.domain.com/', True),
    ('www.sub1.domain.com', 'http://www.sub1.domain.com/', True),
    ('http://www.sub2.domain.com', 'http://www.sub2.domain.com/', True),
    ('www.site2.com/?param=stuff&other=stuff',
     'http://www.site2.com/?param=stuff&other=stuff', True),
    ('site.com?param=stuff&other=stuff',
     'http://site.com/?param=stuff&other=stuff', True),
    ('http://place.com?param=stuff&other=stuff',
     'http://place.com/?param=stuff&other=stuff', True),
    ('sub.domain.com?param=stuff&other=stuff',
     'http://sub.domain.com/?param=stuff&other=stuff', True),
    ('www.sub1.domain.com/?param=stuff&other=stuff',
     'http://www.sub1.domain.com/?param=stuff&other=stuff', True),
    ('http://www.sub2.domain.com/?param=stuff&other=stuff',
     'http://www.sub2.domain.com/?param=stuff&other=stuff', True),
    (' site. ', 'site.', False),  # whitespace removed test
    ('.com', '.com', False),
    ('somesite.co.uk', 'http://somesite.co.uk/', True),  # .xx.yy tld
    ('somesite.ninja', 'http://somesite.ninja/', True),  # weird/new tld
    ('military.mil', 'http://military.mil/', True),  # mil domain
    ('http://.com', 'http://.com', False),
    (' ', '', False),  # whitespace removed test
    ('http:/.com', 'http:/.com', False),
]

# Check both halves of the returned tuple for every case; the first mismatch
# aborts the run with a message naming the offending input.
for test_input, expected_url, expected_ok in urls:
    bool_result, url_result = check_url(test_input)
    assert bool_result == expected_ok, (
        'Boolean result fail. %s expected when '
        'checking %s. %s returned' % (expected_ok, test_input, bool_result))
    assert url_result == expected_url, (
        'URL result fail. When checking "%s", '
        '"%s" was returned but "%s" was expected.'
        % (test_input, url_result, expected_url))

# Only reached when every case above passed.
print('#'*11)
print('Test Passed')
print('#'*11)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment