neotrinity · August 29, 2015 14:09
diff --git a/trie_tokenize.py b/trie_tokenize.py

 class TrieNode(object):
    """
          Trie Node
    """

    def __init__(self,key):
        self._key = key
        self._value = None
        self._children = {}
        self._isWord = False

    isWord = property( lambda self: self._isWord,
                       lambda self, value: setattr(self,'_isWord',value),
                       doc = """ Property to get/set _isWord """)


 class Trie(object):
    """
         Standard Trie Implementation
    """

    def __init__(self):
        self._root = TrieNode("")

    def add(self, word, value=None):
        """
            Add word into the trie with an optional payload
        """
        letters = list(word)
        node = self._root

        for letter in letters:
            if letter not in node._children:
                node._children[letter] = TrieNode(letter)
            node = node._children[letter]

        node.isWord = True               # Flag the word entry
        node._value = value

    def find(self, word):
        """
            Find a given word in the Trie,
            if payload is present then return the payload
            else return bool for the presence of the word
        """
        letters = list(word)
        node = self._root

        for letter in letters:
            if letter not in node._children:
                return False
            node = node._children[letter]
        if node.isWord:
            if node._value:
                return node._value
            else:
                return True
        else:
            return False

    def remove(self, word):
        """
            Remove the word from the Trie
        """
        letters = list(word)
        node = self._root
        leastCommonPrefix = node
        toDel = letters[0]

        for letter in letters:
            if letter not in node._children:
                print "%s Not Found" % word
                return None

            if len(node._children.keys()) >  1:
                leastCommonPrefix = node
                toDel = letter
            node = node._children[letter]

        if node.isWord:
            node = None
            del leastCommonPrefix._children[toDel] # Delete the reference
            leastCommonPrefix = None
            print "Removed %s" % word
            return None
        else:
            print "This search string is NOT a word"
            return None

    def showWords(self, fromNode = None, prefix = ""):
        """
            Returns a list of all the words held by the Trie
        """
        if fromNode:
            node = fromNode
        else:
            node = self._root
        words = []

        def preOrderTraversal(node,word):
            word += node._key
            if node.isWord:
                words.append(word)               # valid word, add it to the list
            if not node._children:                       # reached the leaf
                word = prefix                            # reset the word with the prefix
            for letter, n in node._children.items():
                preOrderTraversal(n,word)

        preOrderTraversal(node,prefix)
        return words

    def autocomplete(self,wordPrefix):
        """
            Given a word prefix, this fetches all the words held in the
            Trie which start with the prefix
        """
        letters = list(wordPrefix)
        node = self._root

        for letter in letters:
            if letter not in node._children:
                return None
            node = node._children[letter]

        return self.showWords(node,wordPrefix[:-1])

    def tokenize(self, string):
        """
            Given a string tokenize it into words present in the Trie.

            eg: peanutbutter -> peanut butter
                peanutmeg -> pea + nutmeg
        """
        def _tokenize(index, node=None, acc=[]):
            if node is None:
                node = self._root

            if index >= len(string):
                if node.isWord:
                    return acc
                else:
                    return []
            else:
                letter = string[index]
                if letter in node._children:
                    if node.isWord:

                        # be greedy and go for the maximum match ie., favour peanut over pea
                        acc1 = _tokenize(index+1, node=node._children[letter], acc=[letter])
                        if acc1:
                            ## if that recursive branch succeeds then return the result
                            acc.extend(acc1)
                            return acc
                        else:
                            ## because the above recursion failed be less greedy and take
                            ## the next best match path. eg in case of peanutmeg, peanut + meg fails
                            ## so try pea + nutmeg
                            acc.append(" ")
                            return _tokenize(index, node=None, acc=acc)
                    else:
                        acc.append(letter)
                        return _tokenize(index+1, node=node._children[letter], acc=acc)
                else:
                    if node.isWord:
                        acc.append(" ")
                        return _tokenize(index, node=None, acc=acc)
                    else:
                        return []

        acc =  _tokenize(index=0)
        return "".join(acc)


 if __name__ == '__main__':
    #Testing the Trie

    t = Trie()
    t.add('goose')
    t.add('going')
    t.add('porky')
    t.add('poodle')
    t.add('poker')
    t.add('pea')
    t.add('peanut')
    t.add('butter')
    t.add('but')
    t.add('nut')
    t.add('nutmeg')
    print " Trie is created with the following words "
    print t.showWords()

    print "trying ::: tokenization"

    print t.tokenize("porkygoose")

    print t.tokenize("peanutmeg")

    print t.tokenize("peanutbutter")

	class TrieNode(object):
	"""
	Trie Node
	"""

	def __init__(self,key):
	self._key = key
	self._value = None
	self._children = {}
	self._isWord = False

	isWord = property( lambda self: self._isWord,
	lambda self, value: setattr(self,'_isWord',value),
	doc = """ Property to get/set _isWord """)


	class Trie(object):
	"""
	Standard Trie Implementation
	"""

	def __init__(self):
	self._root = TrieNode("")

	def add(self, word, value=None):
	"""
	Add word into the trie with an optional payload
	"""
	letters = list(word)
	node = self._root

	for letter in letters:
	if letter not in node._children:
	node._children[letter] = TrieNode(letter)
	node = node._children[letter]

	node.isWord = True # Flag the word entry
	node._value = value

	def find(self, word):
	"""
	Find a given word in the Trie,
	if payload is present then return the payload
	else return bool for the presence of the word
	"""
	letters = list(word)
	node = self._root

	for letter in letters:
	if letter not in node._children:
	return False
	node = node._children[letter]
	if node.isWord:
	if node._value:
	return node._value
	else:
	return True
	else:
	return False

	def remove(self, word):
	"""
	Remove the word from the Trie
	"""
	letters = list(word)
	node = self._root
	leastCommonPrefix = node
	toDel = letters[0]

	for letter in letters:
	if letter not in node._children:
	print "%s Not Found" % word
	return None

	if len(node._children.keys()) > 1:
	leastCommonPrefix = node
	toDel = letter
	node = node._children[letter]

	if node.isWord:
	node = None
	del leastCommonPrefix._children[toDel] # Delete the reference
	leastCommonPrefix = None
	print "Removed %s" % word
	return None
	else:
	print "This search string is NOT a word"
	return None

	def showWords(self, fromNode = None, prefix = ""):
	"""
	Returns a list of all the words held by the Trie
	"""
	if fromNode:
	node = fromNode
	else:
	node = self._root
	words = []

	def preOrderTraversal(node,word):
	word += node._key
	if node.isWord:
	words.append(word) # valid word, add it to the list
	if not node._children: # reached the leaf
	word = prefix # reset the word with the prefix
	for letter, n in node._children.items():
	preOrderTraversal(n,word)

	preOrderTraversal(node,prefix)
	return words

	def autocomplete(self,wordPrefix):
	"""
	Given a word prefix, this fetches all the words held in the
	Trie which start with the prefix
	"""
	letters = list(wordPrefix)
	node = self._root

	for letter in letters:
	if letter not in node._children:
	return None
	node = node._children[letter]

	return self.showWords(node,wordPrefix[:-1])

	def tokenize(self, string):
	"""
	Given a string tokenize it into words present in the Trie.

	eg: peanutbutter -> peanut butter
	peanutmeg -> pea + nutmeg
	"""
	def _tokenize(index, node=None, acc=[]):
	if node is None:
	node = self._root

	if index >= len(string):
	if node.isWord:
	return acc
	else:
	return []
	else:
	letter = string[index]
	if letter in node._children:
	if node.isWord:

	# be greedy and go for the maximum match ie., favour peanut over pea
	acc1 = _tokenize(index+1, node=node._children[letter], acc=[letter])
	if acc1:
	## if that recursive branch succeeds then return the result
	acc.extend(acc1)
	return acc
	else:
	## because the above recursion failed be less greedy and take
	## the next best match path. eg in case of peanutmeg, peanut + meg fails
	## so try pea + nutmeg
	acc.append(" ")
	return _tokenize(index, node=None, acc=acc)
	else:
	acc.append(letter)
	return _tokenize(index+1, node=node._children[letter], acc=acc)
	else:
	if node.isWord:
	acc.append(" ")
	return _tokenize(index, node=None, acc=acc)
	else:
	return []

	acc = _tokenize(index=0)
	return "".join(acc)


	if __name__ == '__main__':
	#Testing the Trie

	t = Trie()
	t.add('goose')
	t.add('going')
	t.add('porky')
	t.add('poodle')
	t.add('poker')
	t.add('pea')
	t.add('peanut')
	t.add('butter')
	t.add('but')
	t.add('nut')
	t.add('nutmeg')
	print " Trie is created with the following words "
	print t.showWords()

	print "trying ::: tokenization"

	print t.tokenize("porkygoose")

	print t.tokenize("peanutmeg")

	print t.tokenize("peanutbutter")