Last active
August 29, 2015 14:09
-
-
Save neotrinity/e54f2a132b00e1090965 to your computer and use it in GitHub Desktop.
Split string to words problem using a trie. Trie.tokenize method takes in a string and returns the tokenized string if we can successfully match else it returns back an empty string. This is a recursive solution and goes through all possible approaches. It greedily looks for the maximum length words to tokenize on and if it fails, then it takes …
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class TrieNode(object): | |
""" | |
Trie Node | |
""" | |
def __init__(self,key): | |
self._key = key | |
self._value = None | |
self._children = {} | |
self._isWord = False | |
isWord = property( lambda self: self._isWord, | |
lambda self, value: setattr(self,'_isWord',value), | |
doc = """ Property to get/set _isWord """) | |
class Trie(object): | |
""" | |
Standard Trie Implementation | |
""" | |
def __init__(self): | |
self._root = TrieNode("") | |
def add(self, word, value=None): | |
""" | |
Add word into the trie with an optional payload | |
""" | |
letters = list(word) | |
node = self._root | |
for letter in letters: | |
if letter not in node._children: | |
node._children[letter] = TrieNode(letter) | |
node = node._children[letter] | |
node.isWord = True # Flag the word entry | |
node._value = value | |
def find(self, word): | |
""" | |
Find a given word in the Trie, | |
if payload is present then return the payload | |
else return bool for the presence of the word | |
""" | |
letters = list(word) | |
node = self._root | |
for letter in letters: | |
if letter not in node._children: | |
return False | |
node = node._children[letter] | |
if node.isWord: | |
if node._value: | |
return node._value | |
else: | |
return True | |
else: | |
return False | |
def remove(self, word): | |
""" | |
Remove the word from the Trie | |
""" | |
letters = list(word) | |
node = self._root | |
leastCommonPrefix = node | |
toDel = letters[0] | |
for letter in letters: | |
if letter not in node._children: | |
print "%s Not Found" % word | |
return None | |
if len(node._children.keys()) > 1: | |
leastCommonPrefix = node | |
toDel = letter | |
node = node._children[letter] | |
if node.isWord: | |
node = None | |
del leastCommonPrefix._children[toDel] # Delete the reference | |
leastCommonPrefix = None | |
print "Removed %s" % word | |
return None | |
else: | |
print "This search string is NOT a word" | |
return None | |
def showWords(self, fromNode = None, prefix = ""): | |
""" | |
Returns a list of all the words held by the Trie | |
""" | |
if fromNode: | |
node = fromNode | |
else: | |
node = self._root | |
words = [] | |
def preOrderTraversal(node,word): | |
word += node._key | |
if node.isWord: | |
words.append(word) # valid word, add it to the list | |
if not node._children: # reached the leaf | |
word = prefix # reset the word with the prefix | |
for letter, n in node._children.items(): | |
preOrderTraversal(n,word) | |
preOrderTraversal(node,prefix) | |
return words | |
def autocomplete(self,wordPrefix): | |
""" | |
Given a word prefix, this fetches all the words held in the | |
Trie which start with the prefix | |
""" | |
letters = list(wordPrefix) | |
node = self._root | |
for letter in letters: | |
if letter not in node._children: | |
return None | |
node = node._children[letter] | |
return self.showWords(node,wordPrefix[:-1]) | |
def tokenize(self, string): | |
""" | |
Given a string tokenize it into words present in the Trie. | |
eg: peanutbutter -> peanut butter | |
peanutmeg -> pea + nutmeg | |
""" | |
def _tokenize(index, node=None, acc=[]): | |
if node is None: | |
node = self._root | |
if index >= len(string): | |
if node.isWord: | |
return acc | |
else: | |
return [] | |
else: | |
letter = string[index] | |
if letter in node._children: | |
if node.isWord: | |
# be greedy and go for the maximum match ie., favour peanut over pea | |
acc1 = _tokenize(index+1, node=node._children[letter], acc=[letter]) | |
if acc1: | |
## if that recursive branch succeeds then return the result | |
acc.extend(acc1) | |
return acc | |
else: | |
## because the above recursion failed be less greedy and take | |
## the next best match path. eg in case of peanutmeg, peanut + meg fails | |
## so try pea + nutmeg | |
acc.append(" ") | |
return _tokenize(index, node=None, acc=acc) | |
else: | |
acc.append(letter) | |
return _tokenize(index+1, node=node._children[letter], acc=acc) | |
else: | |
if node.isWord: | |
acc.append(" ") | |
return _tokenize(index, node=None, acc=acc) | |
else: | |
return [] | |
acc = _tokenize(index=0) | |
return "".join(acc) | |
if __name__ == '__main__': | |
#Testing the Trie | |
t = Trie() | |
t.add('goose') | |
t.add('going') | |
t.add('porky') | |
t.add('poodle') | |
t.add('poker') | |
t.add('pea') | |
t.add('peanut') | |
t.add('butter') | |
t.add('but') | |
t.add('nut') | |
t.add('nutmeg') | |
print " Trie is created with the following words " | |
print t.showWords() | |
print "trying ::: tokenization" | |
print t.tokenize("porkygoose") | |
print t.tokenize("peanutmeg") | |
print t.tokenize("peanutbutter") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment