Created
March 10, 2016 17:30
-
-
Save klenwell/6feb6b9cd5dac679048e to your computer and use it in GitHub Desktop.
Python script testing regular expressions to detect URLs in strings.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
URL Detection | |
I'm looking for a roughly accurate URL counter for spam detection in user-submitted content. | |
REFERENCES | |
http://daringfireball.net/2010/07/improved_regex_for_matching_urls | |
""" | |
import re | |
# Positive corpus, one test case per line. test_positive() splits this text on
# '\n' and reports FAIL for every line in which a pattern finds no match; it
# also runs each pattern over the whole text to count matches.
# NOTE(review): the "[email protected]" fragments below look like e-mail
# addresses redacted by whatever produced this copy of the file -- confirm
# against the upstream test data before relying on those three cases.
should_match = """\ | |
http://foo.com/blah_blah | |
http://foo.com/blah_blah/ | |
https://foo.com/blah_blah | |
https://foo.com/blah_blah/ | |
(Something like http://foo.com/blah_blah) | |
http://foo.com/blah_blah_(wikipedia) | |
http://foo.com/more_(than)_one_(parens) | |
(Something like http://foo.com/blah_blah_(wikipedia)) | |
http://foo.com/blah_(wikipedia)#cite-1 | |
http://foo.com/blah_(wikipedia)_blah#cite-1 | |
http://foo.com/unicode_(✪)_in_parens | |
http://foo.com/(something)?after=parens | |
http://foo.com/blah_blah. | |
http://foo.com/blah_blah/. | |
<http://foo.com/blah_blah> | |
<http://foo.com/blah_blah/> | |
http://foo.com/blah_blah, | |
http://www.extinguishedscholar.com/wpglob/?p=364. | |
http://✪df.ws/1234 | |
rdar://1234 | |
rdar:/1234 | |
x-yojimbo-item://6303E4C1-6A6E-45A6-AB9D-3A908F59AE0E | |
message://%[email protected]%3e | |
http://➡.ws/䨹 | |
www.c.ws/䨹 | |
<tag>http://example.com</tag> | |
Just a www.example.com link. | |
http://example.com/something?with,commas,in,url, but not at end | |
What about <mailto:[email protected]?subject=TEST> (including brokets). | |
mailto:[email protected] | |
bit.ly/foo | |
“is.gd/foo/” | |
WWW.EXAMPLE.COM | |
http://www.asianewsphoto.com/(S(neugxif4twuizg551ywh3f55))/Web_ENG/View_DetailPhoto.aspx?PicId=752 | |
http://www.asianewsphoto.com/(S(neugxif4twuizg551ywh3f55)) | |
http://lcweb2.loc.gov/cgi-bin/query/h?pp/horyd:@field(NUMBER+@band(thc+5a46634)) | |
magnet:?xt=urn:btih:c12fe1c06bba254a9dc9f519b335aa7c1367a88a&dn""" | |
# Negative corpus, one test case per line. test_negative() splits this text on
# '\n' and reports FAIL for every line in which a pattern finds any match.
should_fail = """\ | |
6:00p | |
filename.txt""" | |
# Extra URL-like lines that test_positive() runs in addition to should_match.
# NOTE(review): the name suggests the patterns are not expected to detect
# these; nothing in this file treats them differently from should_match lines.
known_to_fail = """\ | |
http://example.com/quotes-are-“part” | |
✪df.ws/1234 | |
example.com | |
example.com/""" | |
# Source: https://github.com/lepture/mistune/blob/master/mistune.py#L470
# Matches only http:// and https:// URLs: the scheme, then one or more
# characters that are neither whitespace nor '<', ending on a character that is
# not whitespace and not one of  < . , : ; " ' ) ]  (so trailing punctuation is
# left out of the match). The single capturing group is the whole URL.
MISTUNE_URL = r'''(https?:\/\/[^\s<]+[^<.,:;"')\]\s])''' | |
# Source: https://gist.github.com/uogbuji/705383
# Case-insensitive pattern for web-style URLs. A match must begin with
# http:// or https://, with "www" plus up to three digits and a dot, or with a
# bare "name.tld/" whose final label is 2-4 letters. Parenthesised runs (with
# one further nested level) are accepted inside the URL, and the match must end
# either on such a parenthesised run or on a character that is not whitespace
# or common trailing punctuation/quotes.
# Note: the pattern contains several capturing groups, so re.findall() returns
# tuples; the complete URL is the first item of each tuple.
# The ur'' literal prefix is Python 2 syntax.
GRUBER_URL = ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|' \ | |
ur'\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|' \ | |
ur'[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))' | |
# Source: http://daringfireball.net/2010/07/improved_regex_for_matching_urls
# Same overall shape as GRUBER_URL, but the leading alternative accepts any
# scheme written as a letter followed by word characters or hyphens and a
# colon, which must then be followed by one to three slashes or by a letter,
# digit or '%'. The "www..." and bare "name.tld/" prefixes are accepted too.
# The final character class lists the guillemets and curly quotes as literal
# characters rather than as escapes.
# As with GRUBER_URL, re.findall() returns tuples whose first item is the URL.
GRUBER_ALL = ur'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|' \ | |
ur'[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|' \ | |
ur'\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|' \ | |
ur'[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))' | |
def test_positive():
    """Run every URL pattern over the positive test cases and print the outcome.

    Each line of ``should_match`` and of ``known_to_fail`` is one test case.
    For every pattern, a line counts as PASS when ``findall`` returns at least
    one match and as FAIL otherwise. After the per-line pass, each pattern is
    run once over the whole ``should_match`` text and the number of matches is
    printed next to the number of test cases. Finally the
    ``{pattern name: [pass count, fail count]}`` tally is printed.

    Returns:
        None. All output goes to stdout.
    """
    # Split each corpus on its own. Neither text ends with a newline, so
    # concatenating them before splitting would fuse the last should_match
    # line and the first known_to_fail line into one bogus test case.
    test_cases = should_match.split('\n') + known_to_fail.split('\n')
    patterns = [
        ('Gruber URL', GRUBER_URL),
        ('Gruber All', GRUBER_ALL),
        ('Mistune', MISTUNE_URL),
    ]
    # Compile once; both passes below reuse the same compiled detectors.
    detectors = [(name, re.compile(pattern)) for name, pattern in patterns]
    results = {}

    for name, detector in detectors:
        print('TESTING %s' % (name))
        results[name] = [0, 0]  # [pass count, fail count]
        for test_case in test_cases:
            matches = detector.findall(test_case)
            if not matches:
                print('FAIL: %s' % (test_case))
                results[name][1] += 1
            else:
                print('PASS: %s -> %s' % (test_case, matches))
                results[name][0] += 1

    # NOTE(review): the match count is taken over should_match only, while the
    # denominator also includes the known_to_fail lines -- confirm intended.
    for name, detector in detectors:
        matches = detector.findall(should_match)
        print('%s finds %s of %s' % (name, len(matches), len(test_cases)))

    print(results)
def test_negative():
    """Check that no URL pattern fires on the lines of ``should_fail``.

    Every line of ``should_fail`` is tried against each pattern. A line with
    no match is printed as PASS, a line with one or more matches as FAIL. The
    ``{pattern name: [pass count, fail count]}`` tally is printed at the end.

    Returns:
        None. All output goes to stdout.
    """
    candidates = should_fail.split('\n')
    named_patterns = (
        ('Gruber URL', GRUBER_URL),
        ('Gruber All', GRUBER_ALL),
        ('Mistune', MISTUNE_URL),
    )
    tally = {}

    for label, regex in named_patterns:
        print('NEGATIVE TEST FOR %s' % (label))
        compiled = re.compile(regex)
        counts = tally[label] = [0, 0]  # [pass count, fail count]
        for line in candidates:
            found = compiled.findall(line)
            if not found:
                # Nothing detected, which is the desired outcome here.
                print('PASS: %s -> %s' % (line, found))
                counts[0] += 1
                continue
            print('FAIL: %s' % (line))
            counts[1] += 1

    print(tally)
if __name__ == '__main__':
    # Script entry point: run the positive suite, then the negative suite.
    for run_suite in (test_positive, test_negative):
        run_suite()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment