Created
December 23, 2009 18:03
-
-
Save DataWraith/262667 to your computer and use it in GitHub Desktop.
A simple Trie to find words that match a simple pattern
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class TrieNode | |
def initialize | |
@sub_tries = {} | |
@end_of_word = false | |
end | |
def insert(input) | |
to_insert = input | |
# Convert input to array of chars if it's a string | |
to_insert = to_insert.split(//) if to_insert.is_a?(String) | |
# If to_insert is empty, we have reached the end of the word and add a | |
# end-of-word marker. Otherwise we would get partial words back when we're | |
# looking for a shorter word, for example "coverage", "cover?" => "covera" | |
if to_insert.empty? | |
@end_of_word = true | |
return | |
end | |
# Otherwise, create a sub_trie for the first character of the input and | |
# recursively insert the remainder of the word | |
cur_char = to_insert.shift | |
@sub_tries[cur_char] ||= TrieNode.new | |
@sub_tries[cur_char].insert(to_insert) | |
to_insert.unshift(cur_char) | |
end | |
def find_words(word) | |
to_find = word | |
to_find = to_find.split(//) if to_find.is_a?(String) | |
# If to_find is empty, we reached the end of the template. If the | |
# end-of-word marker is set, we return the empty string, which will be | |
# prepended with the characters of the word as we move upwards in the trie. | |
# Otherwise we return the empty array, which results in no word being added | |
# to the results. | |
if to_find.empty? | |
return [''] if @end_of_word | |
return [] | |
end | |
cur_char = to_find.shift | |
result = [] | |
if cur_char == '?' | |
# Search all sub-tries for words matching the remainder of the template | |
# and add them to the results | |
@sub_tries.each_pair do |key, value| | |
result += value.find_words(to_find).map { |w| key + w } | |
end | |
elsif @sub_tries[cur_char] | |
# We have a sub-trie that matches the current character of the template. | |
# Recursively get all words that match the remainder of the template | |
# below that sub-trie. | |
result = @sub_tries[cur_char].find_words(to_find).map { |w| cur_char + w } | |
else | |
# We don't have anything that matches the template. | |
result = [] | |
end | |
to_find.unshift(cur_char) | |
return result | |
end | |
end | |
DICTIONARY = File.read('/usr/share/dict/usa').split("\n") | |
print "Building Trie... " | |
@root = TrieNode.new | |
DICTIONARY.each { |word| @root.insert(word) } | |
puts "done." | |
puts "Searching trie..." | |
@root.find_words('?r?te').each do |word| | |
puts word | |
end | |
# $wc -l /usr/share/dict/usa | |
# 98569 /usr/share/dict/usa | |
# | |
# $time ruby trie.rb | |
# Building Trie... done. | |
# Searching trie... | |
# Crete | |
# brute | |
# crate | |
# grate | |
# irate | |
# orate | |
# prate | |
# trite | |
# write | |
# wrote | |
# | |
# real 0m3.561s | |
# user 0m3.303s | |
# sys 0m0.117s | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment