stringertheory · September 29, 2018 14:29
diff --git a/lenient_url_scrub.py b/lenient_url_scrub.py
 import re
 import scrubadub


 class UrlFilth(scrubadub.filth.url.UrlFilth):
    """URL filth whose pattern also accepts links written without a scheme.

    The ``protocol`` group is optional, so a bare ``example.com`` matches as
    well as ``https://example.com``. Because ``@`` is part of the domain
    character class, an address such as ``alice@example.com`` matches too.
    """

    # Verbose-mode pattern with three named groups: ``protocol``, ``domain``
    # and ``path``. The domain must end in a dot followed by 2-6 lowercase
    # ASCII letters; the path may be empty.
    regex = re.compile(r'''
        (?P<protocol>
            (https?:\/\/(www\.)?|www\.)?         # protocol http://, etc
        )(?P<domain>
            [\-\w@:%\.\+~\#=]{2,256}\.[a-z]{2,6} # domain name
            /?                                   # can have a trailing slash
        )(?P<path>
            [\-\w@:%\+\.~\#?&/=]*                # rest of path, query, & hash
        )
    ''', re.VERBOSE)


 class UrlDetector(scrubadub.detectors.base.RegexDetector):
    """Regex-based detector bound to the lenient ``UrlFilth`` defined above."""

    # Filth class whose ``regex`` attribute supplies the pattern.
    # NOTE(review): presumably RegexDetector scans text with
    # ``filth_cls.regex`` -- confirm against the scrubadub version in use.
    filth_cls = UrlFilth


 # Build a default scrubber, drop its stock 'name' and 'url' detectors,
 # then register the lenient URL detector.
 SCRUBBER = scrubadub.Scrubber()
 for stock_detector in ('name', 'url'):
    SCRUBBER.remove_detector(stock_detector)
 SCRUBBER.add_detector(UrlDetector)

 SCRUBBER.clean(u'''
 Link 1: https://example.com
 Link 2: example.com
 Email: alice@example.com
 ''')
 # The call above returns:
 # Link 1: {{URL}}
 # Link 2: {{URL}}
 # Email: {{URL+EMAIL}}
	import re
	import scrubadub


	class UrlFilth(scrubadub.filth.url.UrlFilth):
	    """URL filth whose pattern also accepts links written without a scheme.

	    The ``protocol`` group is optional, so a bare ``example.com`` matches
	    as well as ``https://example.com``. Because ``@`` is part of the domain
	    character class, an address such as ``alice@example.com`` matches too.
	    """

	    # The bar between the two protocol alternatives must be an unescaped
	    # ``|``. Written as ``\|`` it matches a literal pipe character, and
	    # the ``http(s)://`` / ``www.`` prefixes are then never captured.
	    regex = re.compile(r'''
	        (?P<protocol>
	            (https?:\/\/(www\.)?|www\.)?         # protocol http://, etc
	        )(?P<domain>
	            [\-\w@:%\.\+~\#=]{2,256}\.[a-z]{2,6} # domain name
	            /?                                   # can have a trailing slash
	        )(?P<path>
	            [\-\w@:%\+\.~\#?&/=]*                # rest of path, query, & hash
	        )
	    ''', re.VERBOSE)


	class UrlDetector(scrubadub.detectors.base.RegexDetector):
	    """Regex-based detector bound to the lenient ``UrlFilth`` class."""

	    # Filth class whose ``regex`` attribute supplies the pattern.
	    # The assignment must sit inside the class body; at class level it
	    # leaves the class without a suite.
	    filth_cls = UrlFilth


	# Build a default scrubber, drop its stock 'name' and 'url' detectors,
	# then register the lenient URL detector.
	SCRUBBER = scrubadub.Scrubber()
	for stock_detector in ('name', 'url'):
	    SCRUBBER.remove_detector(stock_detector)
	SCRUBBER.add_detector(UrlDetector)

	SCRUBBER.clean(u'''
	Link 1: https://example.com
	Link 2: example.com
	Email: alice@example.com
	''')
	# The call above returns:
	# Link 1: {{URL}}
	# Link 2: {{URL}}
	# Email: {{URL+EMAIL}}
No results found