Last active
March 30, 2017 11:57
-
-
Save blubberdiblub/fc5656abd1514501ae0a1383ae3e3520 to your computer and use it in GitHub Desktop.
Measuring compiled regex vs. regex cache performance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# There was no noticeable difference at the time of testing:
#
# Testing string regex ... took 10.5 seconds
# Verification: good texts: 93307 bad texts: 6693
# Testing compiled regex ... took 10.5 seconds
# Verification: good texts: 93307 bad texts: 6693
import random | |
import re | |
import time | |
from functools import partial | |
def badwords_to_regex(badwords):
    r"""Build a regex source string matching any of *badwords* as a whole word.

    Every word is passed through ``re.escape`` so it is matched literally,
    and the alternation is wrapped in ``\b`` word-boundary anchors.

    Args:
        badwords: Iterable of strings to match.

    Returns:
        A pattern string of the form ``\b(word1|word2|...)\b``. When
        *badwords* is empty, the never-matching pattern ``(?!)`` is returned.
    """
    alternatives = [re.escape(word) for word in badwords]
    if not alternatives:
        # An empty group (r'\b()\b') would match at any word boundary and
        # flag almost every text; an empty negative lookahead never matches.
        return r'(?!)'
    return r'\b(' + r'|'.join(alternatives) + r')\b'
def generate_word(min_length=2, max_length=10):
    """Return a random word made of ASCII letters.

    The length is drawn with ``random.randint`` from ``min_length`` to
    ``max_length`` (both inclusive); each letter is then picked with
    ``random.choice``.
    """
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    letters = []
    # The length is drawn once, before any letter is chosen.
    for _ in range(random.randint(min_length, max_length)):
        letters.append(random.choice(alphabet))
    return ''.join(letters)
def generate_text(min_length=20, max_length=150):
    """Return a random text of ASCII letters and spaces.

    The length is drawn with ``random.randint`` from ``min_length`` to
    ``max_length`` (both inclusive). The space is one of the candidate
    characters, so the result is split into pseudo-words at random places.
    """
    alphabet = " ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    size = random.randint(min_length, max_length)
    pick = random.choice
    return ''.join([pick(alphabet) for _ in range(size)])
def generate_badwords(count=500):
    """Return a list of ``count`` random words from ``generate_word``."""
    words = []
    for _ in range(count):
        words.append(generate_word())
    return words
# change count in order to change the test duration | |
def generate_many_texts(count=100000):
    """Return a list of ``count`` random texts from ``generate_text``."""
    corpus = []
    for _ in range(count):
        corpus.append(generate_text())
    return corpus
def measure_performance(name, texts, func):
    """Time ``func`` over every text and print the duration and a tally.

    Args:
        name: Label printed in the progress line.
        texts: Iterable of texts passed to ``func`` one at a time.
        func: Callable taking one text. A ``None`` result counts the text
            as "good"; any other result counts it as "bad".
    """
    # Flush so the label is visible while the (possibly long) loop runs.
    print("Testing {} ... ".format(name), end="", flush=True)

    count_bad = 0
    count_good = 0

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # high-resolution clock meant for measuring elapsed durations.
    t1 = time.perf_counter()
    for text in texts:
        if func(text) is None:
            count_good += 1
        else:
            count_bad += 1
    t2 = time.perf_counter()

    print("took {:.3g} seconds".format(t2 - t1))
    print("Verification: good texts: {} bad texts: {}".format(count_good, count_bad))
def main():
    """Benchmark searching with a pattern string vs. a precompiled pattern.

    Both searches use the same generated bad-word pattern, the same
    ``re.IGNORECASE`` flag and the same generated texts, so the printed
    good/bad tallies should agree between the two runs.
    """
    words = generate_badwords()
    corpus = generate_many_texts()

    pattern_source = badwords_to_regex(words)
    precompiled = re.compile(pattern_source, re.IGNORECASE)

    cases = (
        # Passing the pattern string makes re.search resolve it on every call.
        ("string regex", partial(re.search, pattern_source, flags=re.IGNORECASE)),
        ("compiled regex", precompiled.search),
    )
    for label, search in cases:
        measure_performance(label, corpus, search)
# Run the benchmark only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment