klenwell · March 10, 2016 17:30
diff --git a/url_detector.py b/url_detector.py
 # -*- coding: utf-8 -*-
 """
    URL Detection

    I'm looking for a roughly accurate URL counter for spam detection in user-submitted content.

    REFERENCES
    http://daringfireball.net/2010/07/improved_regex_for_matching_urls
 """
 import re

 # Sample inputs, one per line, that are each expected to contain a URL.
 # The backslash right after the opening quotes keeps the string from starting
 # with an empty line.  Every sample line carries the same leading whitespace
 # as the surrounding code, so the split test cases include it.
 # NOTE(review): the three `[email protected]` fragments below look like
 # placeholders left by an e-mail obfuscation filter rather than real sample
 # data -- confirm against the referenced article and restore the addresses.
 should_match = """\
 http://foo.com/blah_blah
 http://foo.com/blah_blah/
 https://foo.com/blah_blah
 https://foo.com/blah_blah/
 (Something like http://foo.com/blah_blah)
 http://foo.com/blah_blah_(wikipedia)
 http://foo.com/more_(than)_one_(parens)
 (Something like http://foo.com/blah_blah_(wikipedia))
 http://foo.com/blah_(wikipedia)#cite-1
 http://foo.com/blah_(wikipedia)_blah#cite-1
 http://foo.com/unicode_(✪)_in_parens
 http://foo.com/(something)?after=parens
 http://foo.com/blah_blah.
 http://foo.com/blah_blah/.
 <http://foo.com/blah_blah>
 <http://foo.com/blah_blah/>
 http://foo.com/blah_blah,
 http://www.extinguishedscholar.com/wpglob/?p=364.
 http://✪df.ws/1234
 rdar://1234
 rdar:/1234
 x-yojimbo-item://6303E4C1-6A6E-45A6-AB9D-3A908F59AE0E
 message://%[email protected]%3e
 http://➡.ws/䨹
 www.c.ws/䨹
 <tag>http://example.com</tag>
 Just a www.example.com link.
 http://example.com/something?with,commas,in,url, but not at end
 What about <mailto:[email protected]?subject=TEST> (including brokets).
 mailto:[email protected]
 bit.ly/foo
 “is.gd/foo/”
 WWW.EXAMPLE.COM
 http://www.asianewsphoto.com/(S(neugxif4twuizg551ywh3f55))/Web_ENG/View_DetailPhoto.aspx?PicId=752
 http://www.asianewsphoto.com/(S(neugxif4twuizg551ywh3f55))
 http://lcweb2.loc.gov/cgi-bin/query/h?pp/horyd:@field(NUMBER+@band(thc+5a46634))
 magnet:?xt=urn:btih:c12fe1c06bba254a9dc9f519b335aa7c1367a88a&dn"""

 # Sample inputs, one per line, that must NOT be reported as URLs; split on
 # newlines and checked by test_negative().
 should_fail = """\
 6:00p
 filename.txt"""

 # URL-like inputs, one per line, kept apart from should_match under the
 # "known to fail" label; test_positive() scans them together with should_match.
 known_to_fail = """\
 http://example.com/quotes-are-“part”
 ✪df.ws/1234
 example.com
 example.com/"""


 # Source: https://github.com/lepture/mistune/blob/master/mistune.py#L470
 # Matches http:// and https:// URLs only; the single capture group is the URL.
 MISTUNE_URL = r'''(https?:\/\/[^\s<]+[^<.,:;"')\]\s])'''

 # Source: https://gist.github.com/uogbuji/705383
 # Note: Returns groups: url is first item in group
 # The pattern is assembled from raw literals plus one unicode literal rather
 # than the ``ur''`` prefix, which is a syntax error on Python 3.  The resulting
 # pattern text is unchanged: backslashes in the last piece are doubled so the
 # regex engine still receives ``\s``, ``\[``, ``\xab`` ..., while the
 # ``\uXXXX`` escapes are resolved by the string literal itself.
 GRUBER_URL = (
     r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|'
     r'\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|'
     u'[^\\s`!()\\[\\]{};:\\\'".,<>?\\xab\\xbb\u201c\u201d\u2018\u2019]))'
 )

 # Source: http://daringfireball.net/2010/07/improved_regex_for_matching_urls
 # Same shape as GRUBER_URL but accepts any ``scheme:`` prefix, not just
 # http/https.  The last piece holds literal non-ASCII quote characters, so it
 # must be a unicode literal (the file's utf-8 coding line covers Python 2).
 GRUBER_ALL = (
     r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|'
     r'[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|'
     r'\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|'
     u'[^\\s`!()\\[\\]{};:\\\'".,<>?«»“”‘’]))'
 )


 def test_positive():
    """Check each URL pattern against lines that are expected to contain a URL.

    Every line of ``should_match`` and ``known_to_fail`` is scanned with each
    pattern; a line passes when the pattern finds at least one match in it.
    Output, in order: one PASS/FAIL line per sample and pattern, one summary
    line per pattern giving the number of matches found when ``should_match``
    is scanned as a single text, and the ``{name: [passed, failed]}`` tallies.
    Nothing is returned.
    """
    expected_cases = should_match.split('\n')
    # Split the two corpora separately: neither ends with a newline, so
    # concatenating the raw strings first would glue the last should_match
    # sample to the first known_to_fail sample and lose one test case.
    test_cases = expected_cases + known_to_fail.split('\n')
    patterns = [
        ('Gruber URL', GRUBER_URL),
        ('Gruber All', GRUBER_ALL),
        ('Mistune', MISTUNE_URL)
    ]
    # Compile once; both passes below reuse the same compiled objects.
    detectors = [(name, re.compile(pattern)) for name, pattern in patterns]

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    results = {}
    for name, detector in detectors:
        print('TESTING %s' % (name))

        results[name] = [0, 0]
        for test_case in test_cases:
            matches = detector.findall(test_case)
            if matches:
                print('PASS: %s -> %s' % (test_case, matches))
                results[name][0] += 1
            else:
                print('FAIL: %s' % (test_case))
                results[name][1] += 1

    for name, detector in detectors:
        matches = detector.findall(should_match)
        # Only should_match is scanned here, so report against its own size.
        print('%s finds %s of %s' % (name, len(matches), len(expected_cases)))

    print(results)


 def test_negative():
    """Check that no URL pattern fires on the ``should_fail`` samples.

    Each line of ``should_fail`` is scanned with every pattern.  A line passes
    when the pattern finds nothing in it and fails otherwise.  A PASS/FAIL line
    is printed per sample and pattern, followed by the
    ``{name: [passed, failed]}`` tallies.  Nothing is returned.
    """
    rejected_inputs = should_fail.split('\n')
    candidates = (
        ('Gruber URL', GRUBER_URL),
        ('Gruber All', GRUBER_ALL),
        ('Mistune', MISTUNE_URL),
    )

    results = {}
    for label, regex in candidates:
        print('NEGATIVE TEST FOR %s' % (label))
        detector = re.compile(regex)

        # tally[0] counts passes, tally[1] counts failures.
        tally = results[label] = [0, 0]
        for line in rejected_inputs:
            found = detector.findall(line)
            if not found:
                print('PASS: %s -> %s' % (line, found))
                tally[0] += 1
                continue
            print('FAIL: %s' % (line))
            tally[1] += 1

    print(results)

 # Script entry point: run the positive suite first, then the negative one.
 if __name__ == '__main__':
    for run_suite in (test_positive, test_negative):
        run_suite()
	# -*- coding: utf-8 -*-
	"""
	URL Detection

	I'm looking for a roughly accurate URL counter for spam detection in user-submitted content.

	REFERENCES
	http://daringfireball.net/2010/07/improved_regex_for_matching_urls
	"""
	import re

	# Sample inputs, one per line, that are each expected to contain a URL.
	# The backslash right after the opening quotes keeps the string from starting
	# with an empty line.  Every sample line carries the same leading whitespace
	# as the surrounding code, so the split test cases include it.
	# NOTE(review): the three `[email protected]` fragments below look like
	# placeholders left by an e-mail obfuscation filter rather than real sample
	# data -- confirm against the referenced article and restore the addresses.
	should_match = """\
	http://foo.com/blah_blah
	http://foo.com/blah_blah/
	https://foo.com/blah_blah
	https://foo.com/blah_blah/
	(Something like http://foo.com/blah_blah)
	http://foo.com/blah_blah_(wikipedia)
	http://foo.com/more_(than)_one_(parens)
	(Something like http://foo.com/blah_blah_(wikipedia))
	http://foo.com/blah_(wikipedia)#cite-1
	http://foo.com/blah_(wikipedia)_blah#cite-1
	http://foo.com/unicode_(✪)_in_parens
	http://foo.com/(something)?after=parens
	http://foo.com/blah_blah.
	http://foo.com/blah_blah/.
	<http://foo.com/blah_blah>
	<http://foo.com/blah_blah/>
	http://foo.com/blah_blah,
	http://www.extinguishedscholar.com/wpglob/?p=364.
	http://✪df.ws/1234
	rdar://1234
	rdar:/1234
	x-yojimbo-item://6303E4C1-6A6E-45A6-AB9D-3A908F59AE0E
	message://%[email protected]%3e
	http://➡.ws/䨹
	www.c.ws/䨹
	<tag>http://example.com</tag>
	Just a www.example.com link.
	http://example.com/something?with,commas,in,url, but not at end
	What about <mailto:[email protected]?subject=TEST> (including brokets).
	mailto:[email protected]
	bit.ly/foo
	“is.gd/foo/”
	WWW.EXAMPLE.COM
	http://www.asianewsphoto.com/(S(neugxif4twuizg551ywh3f55))/Web_ENG/View_DetailPhoto.aspx?PicId=752
	http://www.asianewsphoto.com/(S(neugxif4twuizg551ywh3f55))
	http://lcweb2.loc.gov/cgi-bin/query/h?pp/horyd:@field(NUMBER+@band(thc+5a46634))
	magnet:?xt=urn:btih:c12fe1c06bba254a9dc9f519b335aa7c1367a88a&dn"""

	# Sample inputs, one per line, that must NOT be reported as URLs; split on
	# newlines and checked by test_negative().
	should_fail = """\
	6:00p
	filename.txt"""

	# URL-like inputs, one per line, kept apart from should_match under the
	# "known to fail" label; test_positive() scans them together with should_match.
	known_to_fail = """\
	http://example.com/quotes-are-“part”
	✪df.ws/1234
	example.com
	example.com/"""


	# Source: https://github.com/lepture/mistune/blob/master/mistune.py#L470
	# Matches http:// and https:// URLs only; the single capture group is the URL.
	MISTUNE_URL = r'''(https?:\/\/[^\s<]+[^<.,:;"')\]\s])'''

	# Source: https://gist.github.com/uogbuji/705383
	# Note: Returns groups: url is first item in group
	# Alternations are written as a bare ``|`` and the parenthesised groups keep
	# their ``*`` quantifier: an escaped ``\|`` would only match a literal pipe,
	# and without ``*`` nested parentheses inside a URL are not handled.
	# The pattern is assembled from raw literals plus one unicode literal rather
	# than the ``ur''`` prefix, which is a syntax error on Python 3; backslashes
	# in the last piece are doubled so the regex engine still receives ``\s``,
	# ``\[``, ``\xab`` ..., while ``\uXXXX`` is resolved by the string literal.
	GRUBER_URL = (
	    r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|'
	    r'\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|'
	    u'[^\\s`!()\\[\\]{};:\\\'".,<>?\\xab\\xbb\u201c\u201d\u2018\u2019]))'
	)

	# Source: http://daringfireball.net/2010/07/improved_regex_for_matching_urls
	# Same shape as GRUBER_URL but accepts any ``scheme:`` prefix, not just
	# http/https.  The last piece holds literal non-ASCII quote characters, so it
	# must be a unicode literal.
	GRUBER_ALL = (
	    r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|'
	    r'[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|'
	    r'\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|'
	    u'[^\\s`!()\\[\\]{};:\\\'".,<>?«»“”‘’]))'
	)


	def test_positive():
	    """Check each URL pattern against lines that are expected to contain a URL.

	    Every line of ``should_match`` and ``known_to_fail`` is scanned with each
	    pattern; a line passes when the pattern finds at least one match in it.
	    Output, in order: one PASS/FAIL line per sample and pattern, one summary
	    line per pattern giving the number of matches found when ``should_match``
	    is scanned as a single text, and the ``{name: [passed, failed]}`` tallies.
	    Nothing is returned.
	    """
	    expected_cases = should_match.split('\n')
	    # Split the two corpora separately: neither ends with a newline, so
	    # concatenating the raw strings first would glue the last should_match
	    # sample to the first known_to_fail sample and lose one test case.
	    test_cases = expected_cases + known_to_fail.split('\n')
	    patterns = [
	        ('Gruber URL', GRUBER_URL),
	        ('Gruber All', GRUBER_ALL),
	        ('Mistune', MISTUNE_URL)
	    ]
	    # Compile once; both passes below reuse the same compiled objects.
	    detectors = [(name, re.compile(pattern)) for name, pattern in patterns]

	    # Single-argument print(...) behaves the same on Python 2 and Python 3.
	    results = {}
	    for name, detector in detectors:
	        print('TESTING %s' % (name))

	        results[name] = [0, 0]
	        for test_case in test_cases:
	            matches = detector.findall(test_case)
	            if matches:
	                print('PASS: %s -> %s' % (test_case, matches))
	                results[name][0] += 1
	            else:
	                print('FAIL: %s' % (test_case))
	                results[name][1] += 1

	    for name, detector in detectors:
	        matches = detector.findall(should_match)
	        # Only should_match is scanned here, so report against its own size.
	        print('%s finds %s of %s' % (name, len(matches), len(expected_cases)))

	    print(results)


	def test_negative():
	    """Check that no URL pattern fires on the ``should_fail`` samples.

	    Each line of ``should_fail`` is scanned with every pattern.  A line passes
	    when the pattern finds nothing in it and fails otherwise.  A PASS/FAIL line
	    is printed per sample and pattern, followed by the
	    ``{name: [passed, failed]}`` tallies.  Nothing is returned.
	    """
	    test_cases = should_fail.split('\n')
	    patterns = [
	        ('Gruber URL', GRUBER_URL),
	        ('Gruber All', GRUBER_ALL),
	        ('Mistune', MISTUNE_URL)
	    ]

	    # Single-argument print(...) behaves the same on Python 2 and Python 3.
	    results = {}
	    for name, pattern in patterns:
	        print('NEGATIVE TEST FOR %s' % (name))
	        detector = re.compile(pattern)

	        # Index 0 counts passes (no match), index 1 counts failures.
	        results[name] = [0, 0]
	        for test_case in test_cases:
	            matches = detector.findall(test_case)
	            if matches:
	                print('FAIL: %s' % (test_case))
	                results[name][1] += 1
	            else:
	                print('PASS: %s -> %s' % (test_case, matches))
	                results[name][0] += 1

	    print(results)

	# Script entry point: run the positive suite first, then the negative one.
	if __name__ == '__main__':
	    test_positive()
	    test_negative()
No results found