Skip to content

Instantly share code, notes, and snippets.

@blubberdiblub
Last active March 30, 2017 11:57
Show Gist options
  • Save blubberdiblub/fc5656abd1514501ae0a1383ae3e3520 to your computer and use it in GitHub Desktop.
Save blubberdiblub/fc5656abd1514501ae0a1383ae3e3520 to your computer and use it in GitHub Desktop.
Measuring compiled regex vs. regex cache performance
#!/usr/bin/env python3
# There was no noticeable difference at the time of testing:
#
# Testing string regex ... took 10.5 seconds
# Verification: good texts: 93307 bad texts: 6693
# Testing compiled regex ... took 10.5 seconds
# Verification: good texts: 93307 bad texts: 6693
import random
import re
import time
from functools import partial
def badwords_to_regex(badwords):
    r"""Build a regex source string matching any of ``badwords`` as a whole word.

    Each word is escaped with ``re.escape`` so it is matched literally, the
    words are joined into a single capturing alternation, and the group is
    wrapped in ``\b`` word-boundary assertions.

    Args:
        badwords: Iterable of strings to match literally. Empty strings are
            ignored.

    Returns:
        The pattern as a string (not compiled). When there are no non-empty
        words, a pattern that can never match is returned.
    """
    alternatives = '|'.join(re.escape(word) for word in badwords if word)
    if not alternatives:
        # An empty alternative (e.g. r'\b()\b') matches at every word
        # boundary, which would flag any text containing a word as bad.
        # An empty negative lookahead never matches anything.
        return r'(?!)'
    return r'\b(' + alternatives + r')\b'
def generate_word(min_length=2, max_length=10):
    """Return a random word made of ASCII letters.

    The length is drawn with ``random.randint`` from ``min_length`` to
    ``max_length`` (both inclusive); each character is then picked with
    ``random.choice``.
    """
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    size = random.randint(min_length, max_length)
    letters = [random.choice(alphabet) for _ in range(size)]
    return ''.join(letters)
def generate_text(min_length=20, max_length=150):
    """Return a random text made of ASCII letters and spaces.

    The length is drawn with ``random.randint`` from ``min_length`` to
    ``max_length`` (both inclusive); each character is then picked with
    ``random.choice``. The space is one of the candidate characters, so
    the result usually splits into several "words".
    """
    alphabet = " ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    size = random.randint(min_length, max_length)
    pieces = [random.choice(alphabet) for _ in range(size)]
    return ''.join(pieces)
def generate_badwords(count=500):
    """Return a list of ``count`` random words produced by ``generate_word``."""
    words = [generate_word() for _ in range(count)]
    return words
def generate_many_texts(count=100000):
    """Return a list of ``count`` random texts produced by ``generate_text``.

    Change ``count`` in order to change the test duration.
    """
    texts = [generate_text() for _ in range(count)]
    return texts
def measure_performance(name, texts, func):
    """Time ``func`` over every text and print the duration and match tallies.

    Args:
        name: Label printed in the progress line.
        texts: Iterable of strings passed to ``func`` one at a time.
        func: Callable taking a single text. A ``None`` result counts the
            text as good; any other result counts it as bad.

    Returns:
        None. The results are printed to standard output.
    """
    # Flush so the label is visible while the (potentially long) loop runs.
    print("Testing {} ... ".format(name), end="", flush=True)

    count_bad = 0
    count_good = 0

    # time.clock() was removed in Python 3.8; time.perf_counter() is a
    # high-resolution clock suited to measuring elapsed time.
    t1 = time.perf_counter()
    for text in texts:
        if func(text) is None:
            count_good += 1
        else:
            count_bad += 1
    t2 = time.perf_counter()

    print("took {:.3g} seconds".format(t2 - t1))
    print("Verification: good texts: {} bad texts: {}".format(count_good, count_bad))
def main():
    """Benchmark ``re.search`` with a pattern string against a compiled pattern.

    Both variants search the same randomly generated texts for the same
    randomly generated bad words, case-insensitively.
    """
    words = generate_badwords()
    texts = generate_many_texts()

    pattern_source = badwords_to_regex(words)
    pattern = re.compile(pattern_source, re.IGNORECASE)

    candidates = (
        # Passing the pattern string each time goes through the re module's
        # internal pattern cache.
        ("string regex", partial(re.search, pattern_source, flags=re.IGNORECASE)),
        # Calling the compiled pattern's bound method skips that lookup.
        ("compiled regex", pattern.search),
    )
    for label, searcher in candidates:
        measure_performance(label, texts, searcher)
# Run the benchmark only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment