Created
September 20, 2018 09:47
-
-
Save pchc2005/b5f13e136a9c9bb2984e5b92802fc7c9 to your computer and use it in GitHub Desktop.
A python port for regex-weburl.js by @dperini
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

# Compiled pattern that validates a whole string as a public web URL
# (http, https, ftp, or protocol-relative "//host").  Python port of
# regex-weburl.js by @dperini.
#
# The literals keep the ``u`` prefix so the ``\uXXXX`` escapes are resolved by
# the Python parser; every regex escape is written with a doubled backslash so
# no "invalid escape sequence" warning is raised.
URL_REGEX = re.compile(
    u"^"
    # protocol identifier (optional scheme, mandatory "//")
    u"(?:(?:(?:https?|ftp):)?//)"
    # user:pass authentication
    u"(?:\\S+(?::\\S*)?@)?"
    u"(?:"
    # IP address exclusion
    # private & local networks
    u"(?!(?:10|127)(?:\\.\\d{1,3}){3})"
    u"(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})"
    u"(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})"
    # IP address dotted notation octets
    # first octet 1-223: excludes 0.x.x.x and reserved space >= 224.0.0.0
    # last octet 1-254: excludes network & broadcast addresses
    # (first & last IP address of each class)
    u"(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])"
    u"(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}"
    u"(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))"
    u"|"
    # host & domain names, may end with dot
    # can be replaced by a shorter alternative
    # u"(?![-_])(?:[-\\w\u00a1-\uffff]{0,63}[^-_]\\.)+"
    # u"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)"
    # # domain name
    # u"(?:\\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*"
    u"(?:"
    u"(?:"
    u"[a-z0-9\u00a1-\uffff]"
    u"[a-z0-9\u00a1-\uffff_-]{0,62}"
    u")?"
    u"[a-z0-9\u00a1-\uffff]\\."
    u")+"
    # TLD identifier name, may end with dot
    u"(?:[a-z\u00a1-\uffff]{2,}\\.?)"
    u")"
    # port number (optional)
    u"(?::\\d{2,5})?"
    # resource path (optional)
    u"(?:[/?#]\\S*)?"
    # \Z instead of $: in Python "$" also matches just before a trailing
    # newline, which would let "http://example.com\n" validate.
    u"\\Z",
    re.UNICODE | re.I,
)
@doronhorwitz quoting this stack overflow answer:
The fact that you're seeing the u means you're on Python 2 - strings are Unicode by default on Python 3
So u can be safely removed in Python 3
Here is a version with `r` instead of `u`:
import re

# Compiled pattern that validates a whole string as a public web URL
# (http, https, ftp, or protocol-relative "//host").  Python 3 port of
# regex-weburl.js by @dperini.
#
# Raw strings are used throughout; the ``\uXXXX`` escapes are therefore passed
# through to the ``re`` module, which resolves them itself.
URL_REGEX = re.compile(
    r"^"
    # protocol identifier (optional scheme, mandatory "//")
    r"(?:(?:(?:https?|ftp):)?//)"
    # user:pass authentication
    r"(?:\S+(?::\S*)?@)?"
    r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # first octet 1-223: excludes 0.x.x.x and reserved space >= 224.0.0.0
    # last octet 1-254: excludes network & broadcast addresses
    # (first & last IP address of each class)
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # host & domain names, may end with dot
    # can be replaced by a shorter alternative
    # r"(?![-_])(?:[-\w\u00a1-\uffff]{0,63}[^-_]\.)+"
    # r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)"
    # # domain name
    # r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*"
    r"(?:"
    r"(?:"
    r"[a-z0-9\u00a1-\uffff]"
    r"[a-z0-9\u00a1-\uffff_-]{0,62}"
    r")?"
    r"[a-z0-9\u00a1-\uffff]\."
    r")+"
    # TLD identifier name, may end with dot
    r"(?:[a-z\u00a1-\uffff]{2,}\.?)"
    r")"
    # port number (optional)
    r"(?::\d{2,5})?"
    # resource path (optional)
    r"(?:[/?#]\S*)?"
    # \Z instead of $: in Python "$" also matches just before a trailing
    # newline, which would let "http://example.com\n" validate.
    r"\Z",
    re.UNICODE | re.I,
)
Got it, so you originally posted a Python 2 solution. Thanks for the Python 3 version.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is awesome! Thank you @pchc2005
How come you use u-strings and not r-strings?
I get lots of "invalid escape sequence" PEP-8 warnings unless I replace the u's with r's