Last active
April 26, 2023 11:13
-
-
Save durden/4504120 to your computer and use it in GitHub Desktop.
Different ways to 'sanitize' a list of strings to ensure they don't contain any of the contents of another 'ignored' list
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Demonstration of ways to implement this API: | |
sanitize(unsanitized_input, ignored_words) | |
Related discussions: | |
- Modifying a list while looping over it: | |
- http://stackoverflow.com/questions/1207406/remove-items-from-a-list-while-iterating-in-python | |
- Remove all occurences of a value in a list: | |
- http://stackoverflow.com/questions/1157106/remove-all-occurences-of-a-value-from-a-python-list | |
""" | |
# 1. Using set difference | |
def sanitize_1(unsanitized_input, ignored_words): | |
""" | |
Take in two lists and santize first list to ensure it doesn't contain any | |
items from the second list, return sanitized list | |
""" | |
# Downsides: | |
# - Sets are unordered so if user_input was a sentence we lose the | |
# ordering because set difference will create a new set and not | |
# maintain ordering. | |
return list(set(unsanitized_input) - set(ignored_words)) | |
# 2. Using sets and instersection() | |
def sanitize_2(unsanitized_input, ignored_words): | |
""" | |
Take in two lists and santize first list to ensure it doesn't contain any | |
items from the second list, return sanitized list | |
""" | |
# Downsides: | |
# - We must loop over words individually, which could be slower. | |
# - Doesn't handle if an ignore word is present more than once. | |
ignored_words = set(ignored_words) | |
for ignore in ignored_words.intersection(unsanitized_input): | |
unsanitized_input.remove(ignore) | |
return unsanitized_input | |
# 3. Using sets and instersection() with ability to remove ALL ignored words. | |
def sanitize_3(unsanitized_input, ignored_words): | |
""" | |
Take in two lists and santize first list to ensure it doesn't contain any | |
items from the second list, return sanitized list | |
""" | |
# Downsides: | |
# - Looping over list while removing from it? | |
# http://stackoverflow.com/questions/1207406/remove-items-from-a-list-while-iterating-in-python | |
ignored_words = set(ignored_words) | |
for ignore in ignored_words.intersection(unsanitized_input): | |
while ignore in unsanitized_input: | |
unsanitized_input.remove(ignore) | |
return unsanitized_input | |
# 4. Using #3 with knowledge of list remove/mutable semantics considered | |
def sanitize_4(unsanitized_input, ignored_words): | |
""" | |
Take in two lists and santize first list to ensure it doesn't contain any | |
items from the second list, return sanitized list | |
""" | |
# Downsides: | |
# - Looping over list while removing from it? | |
# http://stackoverflow.com/questions/1207406/remove-items-from-a-list-while-iterating-in-python | |
ignored_words = set(ignored_words) | |
for ignore in ignored_words.intersection(unsanitized_input): | |
while ignore in unsanitized_input: | |
unsanitized_input.remove(ignore) | |
# Note: The return is not technically necessary because remove() does an | |
# in-place removal so the original list passed in is modified. | |
# 5. Using #4 without implicit side-effect of modifying list user passed in. | |
# This demonstrates the functional paradigm of functions never modifiying | |
# input parameters and always having the same output for the same input. | |
def sanitize_5(unsanitized_input, ignored_words): | |
""" | |
Take in two lists and santize first list to ensure it doesn't contain any | |
items from the second list, return sanitized list | |
""" | |
# Downsides: | |
# - Requires extra memory usage because original unsanitized_input is | |
# copied. Also, the final memory usage is increased in this version by | |
# the number of words/characters that are NOT in the ignored_words | |
# because those are removed. | |
# Copy input list (various ways to accomplish, but list() syntax is more | |
# clear IMO) | |
unsanitized_input = list(unsanitized_input) | |
# unsanitized_input[:] = unsanitized_input | |
# import copy | |
# unsanitized_input = copy(unsanitized_input) | |
ignored_words = set(ignored_words) | |
for ignore in ignored_words.intersection(unsanitized_input): | |
while ignore in unsanitized_input: | |
unsanitized_input.remove(ignore) | |
# Return is required now b/c we didn't touch the original list, we have our | |
# own local copy that is unsanitized | |
return unsanitized_input | |
def check_results(): | |
user_input = 'the cat walked down the road.'.split() | |
ignore_words = ['the', 'um', 'eh'] | |
print 'sanitize_1', sanitize_1(user_input, ignore_words) | |
print 'sanitize_2', sanitize_2(user_input, ignore_words) | |
print 'sanitize_3', sanitize_3(user_input, ignore_words) | |
# Create new list so sanitize_4 gets fresh copy of words to ignore instead | |
# of an already sanitized list (previous calls modified original | |
# user_input) | |
user_input = 'the cat walked down the road.'.split() | |
sanitize_4(user_input, ignore_words) | |
print 'sanitize_4', user_input | |
# Start fresh again, previous call modified in-place again. | |
user_input = 'the cat walked down the road.'.split() | |
user_input_returned = sanitize_5(user_input, ignore_words) | |
print 'sanitize_5', user_input_returned | |
print ' original list', user_input, id(user_input) | |
print ' sanitized list', user_input_returned, id(user_input_returned) | |
def check_performance(): | |
from timeit import timeit | |
from functools import partial | |
import sys | |
def _sanitize_funcs(): | |
module = sys.modules[__name__] | |
# Loop over them in order so we can print them as we 'improved' the | |
# algorithm | |
for obj in sorted(vars(module).values()): | |
if callable(obj) and obj.__name__.startswith('sanitize_'): | |
yield obj | |
# This is needed to setup the local variables and have a context for | |
# the function. | |
ignore_words = ['the', 'eh' 'um', 'ugh'] | |
lots_of_input = 'the cat walked down the road.' | |
user_input = lots_of_input.split() | |
for func in _sanitize_funcs(): | |
print '%s %f secs' % (func.__name__, | |
timeit(partial(func, user_input, ignore_words))) | |
def main(): | |
print '----- Function Results -----' | |
check_results() | |
print '----- Function Performance -----' | |
check_performance() | |
if __name__ == "__main__": | |
main() |
There is a scoping issue so I changed the following:
# This is needed to setup the local variables and have a context for
# the function.
ignore_words = ['the', 'um', 'eh']
lots_of_input = 'the cat walked down the road.'
for func in _sanitize_funcs():
user_input = lots_of_input.split()
print '%s %f secs' % (func.__name__,
timeit(partial(func, user_input, ignore_words)))
Notice where I moved user_input to.
I also added three more function to test other patterns:
def sanitize_6(unsanitized_input, ignored_words):
new_list = []
for w in unsanitized_input:
if w not in ignored_words:
new_list.append(w)
return new_list
def sanitize_7(unsanitized_input, ignored_words):
return [w for w in unsanitized_input if w not in ignored_words]
def sanitize_8(unsanitized_input, ignored_words):
for w in ignored_words:
try:
unsanitized_input.pop(unsanitized_input.index(w))
except ValueError:
pass
return unsanitized_input
Here are the results:
$ python sanitize.py
----- Function Results -----
sanitize_1 ['down', 'road.', 'walked', 'cat']
sanitize_2 ['cat', 'walked', 'down', 'the', 'road.']
sanitize_3 ['cat', 'walked', 'down', 'road.']
sanitize_4 ['cat', 'walked', 'down', 'road.']
sanitize_5 ['cat', 'walked', 'down', 'road.']
sanitize_6 ['cat', 'walked', 'down', 'road.']
sanitize_7 ['cat', 'walked', 'down', 'road.']
sanitize_8 ['cat', 'walked', 'down', 'road.']
original list ['the', 'cat', 'walked', 'down', 'the', 'road.'] 4337426728
sanitized list ['cat', 'walked', 'down', 'road.'] 4337618024
----- Function Performance -----
sanitize_2 0.965178 secs
sanitize_3 0.972548 secs
sanitize_4 0.971117 secs
sanitize_1 1.561817 secs
sanitize_6 1.808824 secs
sanitize_5 2.613811 secs
sanitize_7 1.333645 secs
sanitize_8 7.508463 secs
sanitize_8 is broken :/
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
i got significantly different results on my Macbook Air:
----- Function Results -----
sanitize_1 ['down', 'road.', 'walked', 'cat']
sanitize_2 ['cat', 'walked', 'down', 'the', 'road.']
sanitize_3 ['cat', 'walked', 'down', 'road.']
sanitize_4 ['cat', 'walked', 'down', 'road.']
sanitize_5 ['cat', 'walked', 'down', 'road.']
original list ['the', 'cat', 'walked', 'down', 'the', 'road.'] 4353351976
sanitized list ['cat', 'walked', 'down', 'road.'] 4353542768
----- Function Performance -----
sanitize_2 0.950371 secs
sanitize_3 0.955089 secs
sanitize_4 0.980491 secs
sanitize_1 1.505497 secs
sanitize_5 1.411789 secs
Note that sanitize_2 didn't even work.