Created
June 18, 2012 06:57
-
-
Save jogojapan/2947210 to your computer and use it in GitHub Desktop.
Ukkonen algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
class Suxtree: | |
# The ID of the node most recently inserted. 0 is the ID of the | |
# root node; we assume it exists right from the start. | |
max_node = 0 | |
# The edges hash. Each edge has a unique ID (consisting of a node | |
# ID and an initial character. The ID is constructed by the | |
# edgeid() method below). The hash below maps edge IDs to edge | |
# triples (FROM,TO,DEST_NODE). If the edge leads to a leaf node, | |
# TO will be -1 and DEST_NODE will be None. | |
edges = {} | |
# A hash representing suffix links. Node IDs are keys, IDs of | |
# target nodes are values. | |
suxlinks = {} | |
def __init__(self,text): | |
# The input test, for which the suffix tree is constructed. | |
self.text = text | |
# The current end position. Leaf nodes are automatically | |
# interpreted at carrying a label that ends here. | |
self.endpos = 0 | |
# The active point | |
self.AP = (0,None,0) | |
# Construct suffix tree. | |
self.build() | |
def edgeid(self,node,initial): | |
return (node << 32) + ord(initial) | |
def invedgeid(self,edge_id): | |
return ((edge_id & 0xFFFFFFFF00000000) >> 32,(edge_id & 0xFFFFFFFF00000000)) | |
def edgelabel(self,edge_id): | |
(edge_start,edge_end,dest) = self.edges[edge_id] | |
if edge_end < 0: | |
edge_end = self.endpos | |
return buffer(self.text,edge_start,edge_end) | |
def is_implicit(self): | |
""" | |
Checks whether the character at self.endpos is present at the | |
active point. If yes, the new active point is returned, | |
otherwise None is returned. | |
""" | |
(node,edge,length) = self.AP | |
endchar = self.text[self.endpos - 1] | |
if length == 0: | |
edge_id = self.edgeid(node,endchar) | |
if edge_id in self.edges: | |
(efrom,eto,dest) = self.edges[edge_id] | |
if (eto - efrom) == 1: | |
return (dest,None,0) | |
else: | |
return (node,endchar,1) | |
else: | |
return None | |
else: | |
edge_id = self.edgeid(node,edge) | |
(efrom,eto,dest) = self.edges[edge_id] | |
if endchar == self.text[efrom + length]: | |
if (eto - efrom) == length + 1: | |
return (dest,None,0) | |
else: | |
return (node,edge,length+1) | |
else: | |
return None | |
def edgesplit(self,edge_id,active_length): | |
""" | |
Splits the edge denoted by edge_id into a prefix (the existing | |
label up to the active point) and a suffix (the remaining | |
characters). Introduces a new internal node between prefix and | |
suffix, and adds another outgoing edge from the new node to a | |
leaf. The label of the new edge starts at (endpos - 1), | |
i.e. it includes only the final character of the suffix | |
currently being inserted. | |
Returns the new internal node. | |
""" | |
(edge_start,edge_end,dest) = self.edges[edge_id] | |
self.max_node += 1 | |
self.edges[edge_id] = (edge_start,edge_start + active_length,self.max_node) | |
new_edge_id1 = self.edgeid(self.max_node,self.text[edge_start + active_length]) | |
self.edges[new_edge_id1] = (edge_start + active_length,edge_end,dest) | |
new_edge_id2 = self.edgeid(self.max_node,self.text[self.endpos - 1]) | |
self.edges[new_edge_id2] = (self.endpos - 1,-1,None) | |
return self.max_node | |
def insert(self): | |
""" | |
Inserts the final character of the text at the current step's | |
end position. If that causes an edge split, the new internal | |
node will be returned. | |
""" | |
(node,edge,active_length) = self.AP | |
if active_length == 0: | |
edge_id = self.edgeid(node,self.text[self.endpos - 1]) | |
self.edges[edge_id] = (self.endpos - 1,-1,None) | |
return None | |
else: | |
edge_id = self.edgeid(node,edge) | |
return self.edgesplit(edge_id,active_length) | |
def canonicize(self): | |
""" | |
Adjusts the active point after it moved down one step along | |
the suffix chain. | |
""" | |
(aNode,aEdge,aLength) = self.AP | |
sys.stderr.write('Canonicize from (%d,%s,%d).\n' % self.AP) | |
edge_id = self.edgeid(aNode,aEdge) | |
(eFrom,eTo,eDest) = self.edges[edge_id] | |
if eTo < 0: | |
eTo = self.endpos | |
while aLength >= (eTo - eFrom): | |
sys.stderr.write('aLength %d, (eTo - eFrom) %d\n' % | |
(aLength,(eTo - eFrom))) | |
newLength = aLength - (eTo - eFrom) | |
if newLength == 0: | |
self.AP = (eDest,None,0) | |
return | |
sys.stderr.write('newLength %d\n' % newLength) | |
aEdge = self.text[self.endpos - newLength - 1] | |
aLength = newLength | |
aNode = eDest | |
edge_id = self.edgeid(aNode,aEdge) | |
(eFrom,eTo,eDest) = self.edges[edge_id] | |
if eTo < 0: | |
break | |
self.AP = (aNode,aEdge,aLength) | |
def build(self): | |
self.endpos = 0 | |
remainder = 0 | |
# The internal node most recently created. | |
prev_node = None | |
while self.endpos < len(self.text): | |
# We move forward the current position by 1 and update all | |
# leaves implicitly: | |
self.endpos += 1 | |
# We increase remainder because the newest 1-character | |
# suffix has not been inserted yet. | |
remainder += 1 | |
sys.stderr.write('==============\nITERATION #%d\n' % self.endpos) | |
# We deal with all remaining suffixes (i.e. those that | |
# have been inserted *implicitly* in previous steps, and | |
# the 1-character one of the current step). | |
while remainder > 0: | |
sys.stderr.write('Remainder %d\n' % remainder) | |
sys.stderr.write('AP (%d,%s,%d)\n' % self.AP) | |
newAP = self.is_implicit() | |
sys.stderr.write(' AP (%s,%s,%d)\n' % (newAP,self.AP[1],self.AP[2])) | |
if newAP is None: | |
sys.stderr.write('Not implicit\n') | |
# Case I: The current suffix is *not* implicictly | |
# contained. We need to make an insert. | |
new_node = self.insert() | |
remainder -= 1 | |
if prev_node is not None and new_node is not None: | |
sys.stderr.write('Setting suxlink (A) from %d to %d\n' % (prev_node,new_node)) | |
self.suxlinks[prev_node] = new_node | |
prev_node = new_node | |
(aNode,aEdge,aLength) = self.AP | |
if aNode == 0: | |
# If the insert was made from root, we reduce | |
# remainder and active-length and make the | |
# next insert from root, too. | |
if remainder == 0: | |
self.AP = (0,None,0) | |
else: | |
self.AP = (0,self.text[self.endpos - remainder],aLength - 1) | |
else: | |
# If the insert was made from an internal | |
# node, we follow its outgoing suffix link to | |
# identify the new active point. | |
if aNode in self.suxlinks: | |
sys.stderr.write('Follow suxlink from %d to %d.\n' | |
% (aNode,self.suxlinks[aNode])) | |
self.AP = (self.suxlinks[aNode],aEdge,aLength) | |
else: | |
sys.stderr.write('Go to root.\n') | |
self.AP = (0,aEdge,aLength) | |
if aLength > 0: | |
self.canonicize() | |
else: | |
# Case II: The current suffix *is* implicitly | |
# contained in the tree. We create a suffix link | |
# from the most recently created internal node and | |
# update the active point. The current step ends | |
# then. | |
(newNode,newEdge,newLength) = newAP | |
if prev_node is not None and newNode != 0: | |
if prev_node != newNode: | |
sys.stderr.write('Setting suxlink (B) from %d to %d\n' % (prev_node,newNode)) | |
self.suxlinks[prev_node] = newNode | |
prev_node = None | |
self.AP = newAP | |
# No further remaining suffixes need to be | |
# inserted. They must all be implicitly contained. | |
break | |
def dot(self,outstream): | |
outstream.write('digraph {\n rankdir = LR;\n edge [arrowsize=0.4,fontsize=10];\n' + | |
' int0 [label="",style=filled,fillcolor=yellow,shape=circle,width=.1,height=.1];\n') | |
for node in range(1,self.max_node + 1): | |
outstream.write( | |
' int%d [label="",style=filled,fillcolor=lightgrey,shape=circle,width=.07,height=.07]\n' \ | |
% node) | |
outstream.write('\n') | |
max_leaf = 0 | |
for edge_id,edge in self.edges.items(): | |
(fromNode,edgeChar) = self.invedgeid(edge_id) | |
(edgeFrom,edgeTo,toNode) = edge | |
if toNode is None: | |
edgeLabel = buffer(self.text,edgeFrom,self.endpos) | |
outstream.write(' leaf%d [label="",shape=point]\n' % max_leaf) | |
outstream.write(' int%d -> leaf%d [label="%s",weight=3]\n' % | |
(fromNode,max_leaf,edgeLabel)) | |
max_leaf += 1 | |
else: | |
edgeLabel = buffer(self.text,edgeFrom,edgeTo - edgeFrom) | |
outstream.write(' int%d -> int%d [label="%s",weight=3]\n' % | |
(fromNode,toNode,edgeLabel)) | |
outstream.write('}\n') | |
if __name__ == '__main__': | |
tree = Suxtree(sys.argv[1]) | |
tree.dot(sys.stdout) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment