Last active
March 28, 2016 22:49
-
-
Save astockwell/ba6b671cc0dcd3136eb3 to your computer and use it in GitHub Desktop.
The power of Trie's -- list search: 0.436379 seconds, trie search: 0.0003269999999999662 seconds.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import string | |
import time | |
pattern_words_only = re.compile('[^A-Za-z0-9 ]+', re.UNICODE) | |
dictionary_words = [line.rstrip('\n') for line in open('/usr/share/dict/words')] | |
# print(len(dictionary_words)) | |
two_cities = 'It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us, we had nothing before us, we were all going direct to Heaven, we were all going direct the other way - in short, the period was so far like the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree of comparison only. There were a king with a large jaw and a queen with a plain face, on the throne of England; there were a king with a large jaw and a queen with a fair face, on the throne of France. In both countries it was clearer than crystal to the lords of the State preserves of loaves and fishes, that things in general were settled for ever.' | |
two_cities_words = re.sub(pattern_words_only, '', two_cities).lower().split(' ') | |
# print(len(two_cities_words)) | |
start_time = time.clock() | |
for word in two_cities_words: | |
if word not in dictionary_words: | |
print(word) | |
print(time.clock() - start_time, "seconds") | |
# (0.436379, 'seconds') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import string | |
import time | |
pattern_words_only = re.compile('[^A-Za-z0-9 ]+', re.UNICODE) | |
dictionary_words = [line.rstrip('\n') for line in open('/usr/share/dict/words')] | |
# print(len(dictionary_words)) | |
_end = '_end_' | |
# Reference implementation: http://stackoverflow.com/a/11016430/1316499 | |
def make_trie(words): | |
root = dict() | |
for word in words: | |
current_dict = root | |
for letter in word: | |
current_dict = current_dict.setdefault(letter, {}) | |
current_dict[_end] = _end | |
return root | |
def in_trie(trie, word): | |
current_dict = trie | |
for letter in word: | |
if letter in current_dict: | |
current_dict = current_dict[letter] | |
else: | |
return False | |
else: | |
if _end in current_dict: | |
return True | |
else: | |
return False | |
# Luckily only have to do this once: | |
dictionary_trie = make_trie(dictionary_words) | |
two_cities = 'It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us, we had nothing before us, we were all going direct to Heaven, we were all going direct the other way - in short, the period was so far like the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree of comparison only. There were a king with a large jaw and a queen with a plain face, on the throne of England; there were a king with a large jaw and a queen with a fair face, on the throne of France. In both countries it was clearer than crystal to the lords of the State preserves of loaves and fishes, that things in general were settled for ever.' | |
two_cities_words = re.sub(pattern_words_only, '', two_cities).lower().split(' ') | |
# print(len(two_cities_words)) | |
start_time = time.clock() | |
for word in two_cities_words: | |
if in_trie(dictionary_trie, word) is False: | |
print(word) | |
print(time.clock() - start_time, "seconds") | |
# (0.0003269999999999662, 'seconds') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment