Created
September 20, 2018 09:47
-
-
Save pchc2005/b5f13e136a9c9bb2984e5b92802fc7c9 to your computer and use it in GitHub Desktop.
A python port for regex-weburl.js by @dperini
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

# Compiled, case-insensitive pattern that validates a complete web URL.
# Python port of regex-weburl.js by @dperini.
#
# Usage: ``URL_REGEX.match(candidate)`` returns a match object when the whole
# string is an acceptable http/https/ftp (or protocol-relative ``//``) URL,
# and ``None`` otherwise.
#
# Every fragment is a raw string so that regex escapes such as \S, \d and \.
# are not treated as (invalid) Python string escapes; the \uXXXX escapes are
# resolved by the ``re`` module itself.
URL_REGEX = re.compile(
    r"^"
    # protocol identifier (scheme is optional, the leading "//" is required)
    r"(?:(?:(?:https?|ftp):)?//)"
    # user:pass authentication (optional)
    r"(?:\S+(?::\S*)?@)?"
    r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes a first octet of 0 (e.g. 0.0.0.0)
    # excludes reserved space >= 224.0.0.0
    # excludes network & broadcast addresses
    # (first & last IP address of each class)
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # host & domain names, may end with dot
    # can be replaced by a shortest alternative
    # r"(?![-_])(?:[-\w\u00a1-\uffff]{0,63}[^-_]\.)+"
    # r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)"
    # # domain name
    # r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*"
    r"(?:"
    r"(?:"
    r"[a-z0-9\u00a1-\uffff]"
    r"[a-z0-9\u00a1-\uffff_-]{0,62}"
    r")?"
    r"[a-z0-9\u00a1-\uffff]\."
    r")+"
    # TLD identifier name, may end with dot
    r"(?:[a-z\u00a1-\uffff]{2,}\.?)"
    r")"
    # port number (optional)
    r"(?::\d{2,5})?"
    # resource path (optional)
    r"(?:[/?#]\S*)?"
    # \Z rather than $: in Python, $ also matches just before a trailing
    # newline, which would let "http://example.com\n" pass validation.
    r"\Z",
    re.UNICODE | re.I,
)
Got it, so you originally posted a Python 2 solution. Thanks for the Python 3 version.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@doronhorwitz, quoting this Stack Overflow answer:
Here is a version with `r` (raw-string prefix) instead of `u`: