pchc2005 · September 20, 2018 09:47 · doronhorwitz · Jan 6, 2020 · jkazimierczak · Apr 9, 2021
diff --git a/regex-weburl.py b/regex-weburl.py
 import re
 # Compiled pattern that validates a complete web URL string.
 #
 # Accepted: an optional http/https/ftp scheme followed by a mandatory "//",
 # optional "user:pass@" credentials, a public dotted-quad IPv4 address or a
 # host name ending in an alphabetic TLD, an optional port and an optional
 # path/query/fragment.
 #
 # The fragments are raw strings so that regex escapes (\S, \d, \.) are not
 # treated as (invalid) string-literal escapes; the \uXXXX escapes are resolved
 # by the `re` module itself. The pattern ends with \Z rather than $ because $
 # would also match just before a trailing newline and let "url\n" validate.
 URL_REGEX = re.compile(
     r"^"
     # protocol identifier (scheme is optional, the "//" is not)
     r"(?:(?:(?:https?|ftp):)?//)"
     # user:pass authentication
     r"(?:\S+(?::\S*)?@)?"
     r"(?:"
     # IP address exclusion: private & local networks
     # 10.0.0.0/8 and 127.0.0.0/8
     r"(?!(?:10|127)(?:\.\d{1,3}){3})"
     # 169.254.0.0/16 and 192.168.0.0/16
     r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
     # 172.16.0.0 - 172.31.255.255
     r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
     # IP address dotted notation octets
     # first octet 1-223: excludes 0.x.x.x and reserved space >= 224.0.0.0
     r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
     # second and third octets 0-255
     r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
     # last octet 1-254: excludes network (.0) & broadcast (.255) addresses
     r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
     r"|"
     # host & domain names: one or more dot-terminated labels of 1-64
     # characters that start and end with a letter, digit or a character in
     # U+00A1-U+FFFF; "_" and "-" are allowed only in the middle
     r"(?:"
     r"(?:"
     r"[a-z0-9\u00a1-\uffff]"
     r"[a-z0-9\u00a1-\uffff_-]{0,62}"
     r")?"
     r"[a-z0-9\u00a1-\uffff]\."
     r")+"
     # TLD identifier name (at least two non-digit characters), may end with dot
     r"(?:[a-z\u00a1-\uffff]{2,}\.?)"
     r")"
     # port number (optional, 2-5 digits)
     r"(?::\d{2,5})?"
     # resource path (optional)
     r"(?:[/?#]\S*)?"
     # true end of string; a trailing newline is not tolerated
     r"\Z",
     re.UNICODE | re.I,
 )
	import re
	# Compiled pattern that validates a complete web URL string.
	#
	# Accepted: an optional http/https/ftp scheme followed by a mandatory "//",
	# optional "user:pass@" credentials, a public dotted-quad IPv4 address or a
	# host name ending in an alphabetic TLD, an optional port and an optional
	# path/query/fragment.
	#
	# Alternation is written with a plain "|": an escaped "\|" would match a
	# literal pipe character instead of separating alternatives. The fragments
	# are raw strings so that regex escapes (\S, \d, \.) are not treated as
	# (invalid) string-literal escapes; the \uXXXX escapes are resolved by the
	# `re` module itself. The pattern ends with \Z rather than $ because $ would
	# also match just before a trailing newline and let "url\n" validate.
	URL_REGEX = re.compile(
	    r"^"
	    # protocol identifier (scheme is optional, the "//" is not)
	    r"(?:(?:(?:https?|ftp):)?//)"
	    # user:pass authentication
	    r"(?:\S+(?::\S*)?@)?"
	    r"(?:"
	    # IP address exclusion: private & local networks
	    # 10.0.0.0/8 and 127.0.0.0/8
	    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
	    # 169.254.0.0/16 and 192.168.0.0/16
	    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
	    # 172.16.0.0 - 172.31.255.255
	    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
	    # IP address dotted notation octets
	    # first octet 1-223: excludes 0.x.x.x and reserved space >= 224.0.0.0
	    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
	    # second and third octets 0-255
	    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
	    # last octet 1-254: excludes network (.0) & broadcast (.255) addresses
	    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
	    r"|"
	    # host & domain names: one or more dot-terminated labels of 1-64
	    # characters that start and end with a letter, digit or a character in
	    # U+00A1-U+FFFF; "_" and "-" are allowed only in the middle
	    r"(?:"
	    r"(?:"
	    r"[a-z0-9\u00a1-\uffff]"
	    r"[a-z0-9\u00a1-\uffff_-]{0,62}"
	    r")?"
	    r"[a-z0-9\u00a1-\uffff]\."
	    r")+"
	    # TLD identifier name (at least two non-digit characters), may end with dot
	    r"(?:[a-z\u00a1-\uffff]{2,}\.?)"
	    r")"
	    # port number (optional, 2-5 digits)
	    r"(?::\d{2,5})?"
	    # resource path (optional)
	    r"(?:[/?#]\S*)?"
	    # true end of string; a trailing newline is not tolerated
	    r"\Z",
	    re.UNICODE | re.I,
	)