blubberdiblub · March 30, 2017 11:57
diff --git a/badwords.py b/badwords.py
 #!/usr/bin/env python3

 # There was no noticeable difference at the time of testing:
 #
 # Testing string regex ... took 10.5 seconds
 # Verification:   good texts: 93307   bad texts: 6693
 # Testing compiled regex ... took 10.5 seconds
 # Verification:   good texts: 93307   bad texts: 6693

 import random
 import re
 import time

 from functools import partial


 def badwords_to_regex(badwords):
    """Build a regex source string matching any of `badwords` as a whole word.

    Each word is passed through re.escape() so it is matched literally. The
    escaped words are joined into one alternation inside a single capturing
    group, and the group is wrapped in word-boundary anchors.

    Args:
        badwords: Iterable of strings. Empty strings are skipped.

    Returns:
        The pattern as a string, usable with re.search() or re.compile().
        If no non-empty word is given, the pattern never matches.
    """
    # An empty alternative between two word boundaries matches at every word
    # edge, which would flag any text at all, so empty words are dropped.
    alternatives = r'|'.join(re.escape(word) for word in badwords if word)
    if not alternatives:
        # `(?!)` is an empty negative lookahead: it can never succeed.
        alternatives = r'(?!)'
    return r'\b(' + alternatives + r')\b'


 def generate_word(min_length=2, max_length=10):
    """Return a random word made of ASCII letters.

    The length is picked with random.randint(min_length, max_length), so both
    bounds are inclusive. Each character is then picked independently with
    random.choice() from the upper- and lower-case letters A-Z / a-z.
    """
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    letters = []
    for _ in range(random.randint(min_length, max_length)):
        letters.append(random.choice(alphabet))
    return ''.join(letters)


 def generate_text(min_length=20, max_length=150):
    """Return a random text of ASCII letters and spaces.

    The length is picked with random.randint(min_length, max_length), so both
    bounds are inclusive. Each character is then picked independently with
    random.choice() from the pool below.
    """
    # The six leading spaces make a space six times as likely as any single
    # letter, so the text breaks up into word-like chunks.
    pool = "      ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    size = random.randint(min_length, max_length)
    picked = [random.choice(pool) for _ in range(size)]
    return ''.join(picked)


 def generate_badwords(count=500):
    """Return a list of `count` random words, each from generate_word()."""
    words = []
    for _ in range(count):
        words.append(generate_word())
    return words


 # change count in order to change the test duration
 def generate_many_texts(count=100000):
    """Return a list of `count` random texts, each from generate_text()."""
    texts = []
    for _ in range(count):
        texts.append(generate_text())
    return texts


 def measure_performance(name, texts, func):
    """Time `func` over every text, then print the duration and a tally.

    Args:
        name: Label shown in the "Testing ..." progress line.
        texts: Iterable of strings, passed to `func` one at a time.
        func: Callable taking one text. A result of None counts the text as
            good; any other result counts it as bad.

    Prints the elapsed seconds (three significant digits) followed by a
    verification line with the good/bad counts. Returns None.
    """
    # Flush so the label is visible while the loop is still running.
    print("Testing {} ... ".format(name), end="", flush=True)

    count_bad = 0
    count_good = 0

    # time.clock() was removed in Python 3.8; time.perf_counter() is the
    # high-resolution clock intended for measuring short durations.
    started = time.perf_counter()

    for text in texts:
        if func(text) is None:
            count_good += 1
        else:
            count_bad += 1

    elapsed = time.perf_counter() - started

    print("took {:.3g} seconds".format(elapsed))
    print("Verification:   good texts: {}   bad texts: {}".format(count_good, count_bad))


 def main():
    """Run the benchmark: string pattern vs. pre-compiled pattern.

    Generates the random bad-word list and the random texts, builds one
    case-insensitive pattern from the words, and times re.search() called
    with the pattern string against the compiled pattern's search() over
    the same texts.
    """
    badwords = generate_badwords()
    many_texts = generate_many_texts()

    pattern = badwords_to_regex(badwords)
    precompiled = re.compile(pattern, re.IGNORECASE)

    # Each entry pairs the printed label with the search callable under test.
    candidates = (
        ("string regex", partial(re.search, pattern, flags=re.IGNORECASE)),
        ("compiled regex", precompiled.search),
    )
    for label, searcher in candidates:
        measure_performance(label, many_texts, searcher)


 # Run the benchmark only when executed as a script, not when imported.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3

	# There was no noticeable difference at the time of testing:
	#
	# Testing string regex ... took 10.5 seconds
	# Verification: good texts: 93307 bad texts: 6693
	# Testing compiled regex ... took 10.5 seconds
	# Verification: good texts: 93307 bad texts: 6693

	import random
	import re
	import time

	from functools import partial


	def badwords_to_regex(badwords):
	    """Build a regex source string matching any of `badwords` as a whole word.

	    Each word is passed through re.escape() so it is matched literally. The
	    escaped words are joined into one alternation inside a single capturing
	    group, and the group is wrapped in word-boundary anchors.

	    Args:
	        badwords: Iterable of strings. Empty strings are skipped.

	    Returns:
	        The pattern as a string, usable with re.search() or re.compile().
	        If no non-empty word is given, the pattern never matches.
	    """
	    # The separator must be a bare `|`; an escaped pipe would be matched
	    # as a literal character instead of separating the alternatives.
	    # Empty words are dropped because an empty alternative between two
	    # word boundaries matches at every word edge.
	    alternatives = r'|'.join(re.escape(word) for word in badwords if word)
	    if not alternatives:
	        # `(?!)` is an empty negative lookahead: it can never succeed.
	        alternatives = r'(?!)'
	    return r'\b(' + alternatives + r')\b'


	def generate_word(min_length=2, max_length=10):
	    """Return a random word made of ASCII letters.

	    The length is picked with random.randint(min_length, max_length), so
	    both bounds are inclusive. Each character is then picked independently
	    with random.choice() from the letters A-Z and a-z.
	    """
	    chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
	    length = random.randint(min_length, max_length)
	    return ''.join(random.choice(chars) for _ in range(length))


	def generate_text(min_length=20, max_length=150):
	    """Return a random text of ASCII letters and spaces.

	    The length is picked with random.randint(min_length, max_length), so
	    both bounds are inclusive. Each character is then picked independently
	    with random.choice() from the pool below.
	    """
	    # The six leading spaces make a space six times as likely as any single
	    # letter, so the text breaks up into word-like chunks.
	    chars = "      ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
	    length = random.randint(min_length, max_length)
	    return ''.join(random.choice(chars) for _ in range(length))


	def generate_badwords(count=500):
	    """Return a list of `count` random words, each from generate_word()."""
	    return [generate_word() for _ in range(count)]


	# change count in order to change the test duration
	def generate_many_texts(count=100000):
	    """Return a list of `count` random texts, each from generate_text()."""
	    return [generate_text() for _ in range(count)]


	def measure_performance(name, texts, func):
	    """Time `func` over every text, then print the duration and a tally.

	    Args:
	        name: Label shown in the "Testing ..." progress line.
	        texts: Iterable of strings, passed to `func` one at a time.
	        func: Callable taking one text. A result of None counts the text
	            as good; any other result counts it as bad.

	    Prints the elapsed seconds (three significant digits) followed by a
	    verification line with the good/bad counts. Returns None.
	    """
	    # Flush so the label is visible while the loop is still running.
	    print("Testing {} ... ".format(name), end="", flush=True)

	    count_bad = 0
	    count_good = 0

	    # time.clock() was removed in Python 3.8; time.perf_counter() is the
	    # high-resolution clock intended for measuring short durations.
	    t1 = time.perf_counter()

	    for text in texts:
	        if func(text) is None:
	            count_good += 1
	        else:
	            count_bad += 1

	    t2 = time.perf_counter()

	    print("took {:.3g} seconds".format(t2 - t1))
	    print("Verification: good texts: {} bad texts: {}".format(count_good, count_bad))


	def main():
	    """Run the benchmark: string pattern vs. pre-compiled pattern.

	    Generates the random bad-word list and the random texts, builds one
	    case-insensitive pattern from the words, and times re.search() called
	    with the pattern string against the compiled pattern's search() over
	    the same texts.
	    """
	    badwords = generate_badwords()
	    many_texts = generate_many_texts()

	    string_regex = badwords_to_regex(badwords)
	    compiled_regex = re.compile(string_regex, re.IGNORECASE)

	    # Variant 1: hand the pattern string to re.search() on every call.
	    measure_performance("string regex",
	                        many_texts,
	                        partial(re.search, string_regex, flags=re.IGNORECASE))

	    # Variant 2: call search() on the pattern object compiled once above.
	    measure_performance("compiled regex",
	                        many_texts,
	                        compiled_regex.search)


	# Run the benchmark only when executed as a script, not when imported.
	if __name__ == '__main__':
	    main()