datamafia · March 24, 2015 01:03
diff --git a/url_validation.py b/url_validation.py
 """Check the validity of URL for maximum compliance, return "ready to
 use" url address (with http:// as needed) on success

 Abstract: Be dry. URL verification is a persistent need, this iteration
    has been very handy for website use, such as profile creation and seems
    to have a near total compliance with the changing URL schemas.

 No code is perfect; please let me know if you find an error or have a proposed
    patch.

 Credit and resources:
    https://mathiasbynens.be/demo/url-regex
    https://gist.github.com/dperini/729294
    https://gist.github.com/dperini/729294#comment-1296121
 """

 import re
 from urlparse import urlparse

 # Pattern applied by check_url() to the rebuilt address. Compiled once at
 # import time instead of on every call. re.IGNORECASE is required because
 # the character classes below only list lowercase ASCII letters, while host
 # names are case-insensitive.
 _URL_REGEX = re.compile(
     u"^"
     # protocol identifier
     r"(?:(?:https?)://)"
     # user:pass authentication
     r"(?:\S+(?::\S*)?@)?"
     r"(?:"
     # IP address exclusion
     # private & local networks
     r"(?!(?:10|127)(?:\.\d{1,3}){3})"
     r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
     r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
     # IP address dotted notation octets
     # excludes loopback network 0.0.0.0
     # excludes reserved space >= 224.0.0.0
     # excludes network & broadcast addresses
     # (first & last IP address of each class)
     r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
     r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
     r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
     u"|"
     # host name
     r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)"
     # domain name
     r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*"
     # TLD identifier
     r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
     u")"
     # port number
     r"(?::\d{2,5})?"
     # resource path
     r"(?:/\S*)?"
     u"$",
     re.IGNORECASE | re.UNICODE)


 def check_url(url):
     """Validate a web address and return it in "ready to use" form.

     All whitespace is removed, ``http://`` is prepended when the input has
     no scheme, a trailing slash is appended to the path, and the rebuilt
     address is matched against a strict http/https pattern.

     Args:
         url : string, required

     Returns: tuple(bool for success, url)
         * (True, rebuilt url) when the rebuilt address matches the pattern.
         * (False, input with whitespace removed) otherwise.

     Respects:
         fragments ...com/param#stuf
         query ...com/param/?foo=bar&bar=foo

     Notes:
         * Only http and https pass validation (FTP, etc. are rejected).
         * Matching is case-insensitive, so ``Example.COM`` is accepted; the
           case of the input is kept in the returned url.
         * A trailing slash is appended whenever the path does not already
           end with one, including after a file name.
         * The ``params`` field reported by urlparse is not copied into the
           rebuilt url.
     """
     # Remove every whitespace character (spaces, tabs, newlines), so the
     # value handed back never contains whitespace, even on failure.
     url = ''.join(url.split())
     if not url:
         return (False, url)

     # urlparse returns (scheme, netloc, path, params, query, fragment)
     scheme, netloc, path, _params, query, fragment = urlparse(url)
     if not netloc and not path:
         return (False, url)

     # add trailing slash if necessary for hygiene
     if not path.endswith('/'):
         path += '/'

     # Rebuild the address. Without a scheme urlparse leaves the host in
     # `path` and `netloc` empty, so concatenating both covers either case.
     # Default to http; this relies on servers redirecting to https as needed.
     new_url = (scheme or 'http') + '://' + netloc + path
     if query:
         new_url += '?' + query
     if fragment:
         new_url += '#' + fragment

     if not _URL_REGEX.match(new_url):
         return (False, url)
     return (True, new_url)
diff --git a/url_validation_test.py b/url_validation_test.py
 """ Tests for url_validation.py script"""

 from __future__ import print_function
 from url_validation import check_url

 # Table-driven check of check_url(). Each entry is
 # (input handed to check_url, url expected back, expected success flag).
 urls = [
     ('www.site2.com', 'http://www.site2.com/', True),
     ('www.site3.com/#', 'http://www.site3.com/', True),
     ('www.site4.com#', 'http://www.site4.com/', True),
     ('www.site5.com/#one', 'http://www.site5.com/#one', True),
     ('www.site6.com/#one#two', 'http://www.site6.com/#one#two', True),
     ('site.com', 'http://site.com/', True),
     ('http://place.com', 'http://place.com/', True),
     ('sub.domain.com', 'http://sub.domain.com/', True),
     ('www.sub1.domain.com', 'http://www.sub1.domain.com/', True),
     ('http://www.sub2.domain.com', 'http://www.sub2.domain.com/', True),
     (
         'www.site2.com/?param=stuff&other=stuff',
         'http://www.site2.com/?param=stuff&other=stuff',
         True,
     ),
     (
         'site.com?param=stuff&other=stuff',
         'http://site.com/?param=stuff&other=stuff',
         True,
     ),
     (
         'http://place.com?param=stuff&other=stuff',
         'http://place.com/?param=stuff&other=stuff',
         True,
     ),
     (
         'sub.domain.com?param=stuff&other=stuff',
         'http://sub.domain.com/?param=stuff&other=stuff',
         True,
     ),
     (
         'www.sub1.domain.com/?param=stuff&other=stuff',
         'http://www.sub1.domain.com/?param=stuff&other=stuff',
         True,
     ),
     (
         'http://www.sub2.domain.com/?param=stuff&other=stuff',
         'http://www.sub2.domain.com/?param=stuff&other=stuff',
         True,
     ),
     (' site.  ', 'site.', False),  # whitespace removed test
     ('.com', '.com', False),
     ('somesite.co.uk', 'http://somesite.co.uk/', True),  # .xx.yy tld
     ('somesite.ninja', 'http://somesite.ninja/', True),  # weird/new tld
     ('military.mil', 'http://military.mil/', True),  # mil domain
     ('http://.com', 'http://.com', False),
     (' ', '', False),  # whitespace removed test
     ('http:/.com', 'http:/.com', False),
 ]

 # Run every case; the success flag is checked before the returned url so a
 # failure reports the more fundamental mismatch first.
 for raw_input, expected_url, expected_ok in urls:
     ok, returned_url = check_url(raw_input)
     assert ok == expected_ok, (
         'Boolean result fail. %s expected when checking %s. %s returned'
         % (expected_ok, raw_input, ok))
     assert returned_url == expected_url, (
         'URL result fail. When checking "%s", "%s" was returned but "%s" '
         'was expected.' % (raw_input, returned_url, expected_url))

 # Only reached when no assertion above fired.
 banner = '#' * 11
 print(banner)
 print('Test Passed')
 print(banner)
	"""Check the validity of URL for maximum compliance, return "ready to
	use" url address (with http:// as needed) on success

	Abstract: Be dry. URL verification is a persistent need, this iteration
	has been very handy for website use, such as profile creation and seems
	to have a near total compliance with the changing URL schemas.

	No code is perfect; please let me know if you find an error or have a proposed
	patch.

	Credit and resources:
	https://mathiasbynens.be/demo/url-regex
	https://gist.github.com/dperini/729294
	https://gist.github.com/dperini/729294#comment-1296121
	"""

	import re
	from urlparse import urlparse

	# Pattern applied by check_url() to the rebuilt address. Compiled once at
	# import time instead of on every call. re.IGNORECASE is required because
	# the character classes below only list lowercase ASCII letters, while host
	# names are case-insensitive.
	_URL_REGEX = re.compile(
	    u"^"
	    # protocol identifier
	    r"(?:(?:https?)://)"
	    # user:pass authentication
	    r"(?:\S+(?::\S*)?@)?"
	    r"(?:"
	    # IP address exclusion
	    # private & local networks
	    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
	    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
	    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
	    # IP address dotted notation octets
	    # excludes loopback network 0.0.0.0
	    # excludes reserved space >= 224.0.0.0
	    # excludes network & broadcast addresses
	    # (first & last IP address of each class)
	    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
	    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
	    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
	    # alternation between the IP form above and the host-name form below
	    u"|"
	    # host name
	    r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)"
	    # domain name (zero or more additional dot-separated labels)
	    r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*"
	    # TLD identifier
	    r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
	    u")"
	    # port number
	    r"(?::\d{2,5})?"
	    # resource path
	    r"(?:/\S*)?"
	    u"$",
	    re.IGNORECASE | re.UNICODE)


	def check_url(url):
	    """Validate a web address and return it in "ready to use" form.

	    All whitespace is removed, ``http://`` is prepended when the input has
	    no scheme, a trailing slash is appended to the path, and the rebuilt
	    address is matched against a strict http/https pattern.

	    Args:
	        url : string, required

	    Returns: tuple(bool for success, url)
	        * (True, rebuilt url) when the rebuilt address matches the pattern.
	        * (False, input with whitespace removed) otherwise.

	    Respects:
	        fragments ...com/param#stuf
	        query ...com/param/?foo=bar&bar=foo

	    Notes:
	        * Only http and https pass validation (FTP, etc. are rejected).
	        * Matching is case-insensitive, so ``Example.COM`` is accepted; the
	          case of the input is kept in the returned url.
	        * A trailing slash is appended whenever the path does not already
	          end with one, including after a file name.
	        * The ``params`` field reported by urlparse is not copied into the
	          rebuilt url.
	    """
	    # Remove every whitespace character (spaces, tabs, newlines), so the
	    # value handed back never contains whitespace, even on failure.
	    url = ''.join(url.split())
	    if not url:
	        return (False, url)

	    # urlparse returns (scheme, netloc, path, params, query, fragment)
	    scheme, netloc, path, _params, query, fragment = urlparse(url)
	    if not netloc and not path:
	        return (False, url)

	    # add trailing slash if necessary for hygiene
	    if not path.endswith('/'):
	        path += '/'

	    # Rebuild the address. Without a scheme urlparse leaves the host in
	    # `path` and `netloc` empty, so concatenating both covers either case.
	    # Default to http; this relies on servers redirecting to https as needed.
	    new_url = (scheme or 'http') + '://' + netloc + path
	    if query:
	        new_url += '?' + query
	    if fragment:
	        new_url += '#' + fragment

	    if not _URL_REGEX.match(new_url):
	        return (False, url)
	    return (True, new_url)
	""" Tests for url_validation.py script"""

	from __future__ import print_function
	from url_validation import check_url

	# Table-driven check of check_url(). Each entry is
	# (Test_Data, Expected Result, expected bool).
	urls = [
	    ('www.site2.com', 'http://www.site2.com/', True),
	    ('www.site3.com/#', 'http://www.site3.com/', True),
	    ('www.site4.com#', 'http://www.site4.com/', True),
	    ('www.site5.com/#one', 'http://www.site5.com/#one', True),
	    ('www.site6.com/#one#two', 'http://www.site6.com/#one#two', True),
	    ('site.com', 'http://site.com/', True),
	    ('http://place.com', 'http://place.com/', True),
	    ('sub.domain.com', 'http://sub.domain.com/', True),
	    ('www.sub1.domain.com', 'http://www.sub1.domain.com/', True),
	    ('http://www.sub2.domain.com', 'http://www.sub2.domain.com/', True),
	    ('www.site2.com/?param=stuff&other=stuff',
	        'http://www.site2.com/?param=stuff&other=stuff', True),
	    ('site.com?param=stuff&other=stuff',
	        'http://site.com/?param=stuff&other=stuff', True),
	    ('http://place.com?param=stuff&other=stuff',
	        'http://place.com/?param=stuff&other=stuff', True),
	    ('sub.domain.com?param=stuff&other=stuff',
	        'http://sub.domain.com/?param=stuff&other=stuff', True),
	    ('www.sub1.domain.com/?param=stuff&other=stuff',
	        'http://www.sub1.domain.com/?param=stuff&other=stuff', True),
	    ('http://www.sub2.domain.com/?param=stuff&other=stuff',
	        'http://www.sub2.domain.com/?param=stuff&other=stuff', True),
	    (' site. ', 'site.', False),  # whitespace removed test
	    ('.com', '.com', False),
	    ('somesite.co.uk', 'http://somesite.co.uk/', True),  # .xx.yy tld
	    ('somesite.ninja', 'http://somesite.ninja/', True),  # weird/new tld
	    ('military.mil', 'http://military.mil/', True),  # mil domain
	    ('http://.com', 'http://.com', False),
	    (' ', '', False),  # whitespace removed test
	    ('http:/.com', 'http:/.com', False),
	]

	# Run every case. The loop body must be indented under the `for`; the
	# success flag is checked before the returned url.
	for test_data, expected_url, expected_bool in urls:
	    bool_result, url_result = check_url(test_data)
	    assert bool_result == expected_bool, (
	        'Boolean result fail. %s expected when '
	        'checking %s. %s returned'
	        % (expected_bool, test_data, bool_result))
	    assert url_result == expected_url, (
	        'URL result fail. When checking "%s", '
	        '"%s" was returned but "%s" was expected.'
	        % (test_data, url_result, expected_url))

	# Only reached when no assertion above fired.
	print('#'*11)
	print('Test Passed')
	print('#'*11)