Skip to content

Instantly share code, notes, and snippets.

@jogojapan
Created June 18, 2012 06:57
Show Gist options
  • Save jogojapan/2947210 to your computer and use it in GitHub Desktop.
Save jogojapan/2947210 to your computer and use it in GitHub Desktop.
Ukkonen algorithm
import sys
class Suxtree:
# The ID of the node most recently inserted. 0 is the ID of the
# root node; we assume it exists right from the start.
max_node = 0
# The edges hash. Each edge has a unique ID (consisting of a node
# ID and an initial character. The ID is constructed by the
# edgeid() method below). The hash below maps edge IDs to edge
# triples (FROM,TO,DEST_NODE). If the edge leads to a leaf node,
# TO will be -1 and DEST_NODE will be None.
edges = {}
# A hash representing suffix links. Node IDs are keys, IDs of
# target nodes are values.
suxlinks = {}
def __init__(self,text):
# The input test, for which the suffix tree is constructed.
self.text = text
# The current end position. Leaf nodes are automatically
# interpreted at carrying a label that ends here.
self.endpos = 0
# The active point
self.AP = (0,None,0)
# Construct suffix tree.
self.build()
def edgeid(self,node,initial):
return (node << 32) + ord(initial)
def invedgeid(self,edge_id):
return ((edge_id & 0xFFFFFFFF00000000) >> 32,(edge_id & 0xFFFFFFFF00000000))
def edgelabel(self,edge_id):
(edge_start,edge_end,dest) = self.edges[edge_id]
if edge_end < 0:
edge_end = self.endpos
return buffer(self.text,edge_start,edge_end)
def is_implicit(self):
"""
Checks whether the character at self.endpos is present at the
active point. If yes, the new active point is returned,
otherwise None is returned.
"""
(node,edge,length) = self.AP
endchar = self.text[self.endpos - 1]
if length == 0:
edge_id = self.edgeid(node,endchar)
if edge_id in self.edges:
(efrom,eto,dest) = self.edges[edge_id]
if (eto - efrom) == 1:
return (dest,None,0)
else:
return (node,endchar,1)
else:
return None
else:
edge_id = self.edgeid(node,edge)
(efrom,eto,dest) = self.edges[edge_id]
if endchar == self.text[efrom + length]:
if (eto - efrom) == length + 1:
return (dest,None,0)
else:
return (node,edge,length+1)
else:
return None
def edgesplit(self,edge_id,active_length):
"""
Splits the edge denoted by edge_id into a prefix (the existing
label up to the active point) and a suffix (the remaining
characters). Introduces a new internal node between prefix and
suffix, and adds another outgoing edge from the new node to a
leaf. The label of the new edge starts at (endpos - 1),
i.e. it includes only the final character of the suffix
currently being inserted.
Returns the new internal node.
"""
(edge_start,edge_end,dest) = self.edges[edge_id]
self.max_node += 1
self.edges[edge_id] = (edge_start,edge_start + active_length,self.max_node)
new_edge_id1 = self.edgeid(self.max_node,self.text[edge_start + active_length])
self.edges[new_edge_id1] = (edge_start + active_length,edge_end,dest)
new_edge_id2 = self.edgeid(self.max_node,self.text[self.endpos - 1])
self.edges[new_edge_id2] = (self.endpos - 1,-1,None)
return self.max_node
def insert(self):
"""
Inserts the final character of the text at the current step's
end position. If that causes an edge split, the new internal
node will be returned.
"""
(node,edge,active_length) = self.AP
if active_length == 0:
edge_id = self.edgeid(node,self.text[self.endpos - 1])
self.edges[edge_id] = (self.endpos - 1,-1,None)
return None
else:
edge_id = self.edgeid(node,edge)
return self.edgesplit(edge_id,active_length)
def canonicize(self):
"""
Adjusts the active point after it moved down one step along
the suffix chain.
"""
(aNode,aEdge,aLength) = self.AP
sys.stderr.write('Canonicize from (%d,%s,%d).\n' % self.AP)
edge_id = self.edgeid(aNode,aEdge)
(eFrom,eTo,eDest) = self.edges[edge_id]
if eTo < 0:
eTo = self.endpos
while aLength >= (eTo - eFrom):
sys.stderr.write('aLength %d, (eTo - eFrom) %d\n' %
(aLength,(eTo - eFrom)))
newLength = aLength - (eTo - eFrom)
if newLength == 0:
self.AP = (eDest,None,0)
return
sys.stderr.write('newLength %d\n' % newLength)
aEdge = self.text[self.endpos - newLength - 1]
aLength = newLength
aNode = eDest
edge_id = self.edgeid(aNode,aEdge)
(eFrom,eTo,eDest) = self.edges[edge_id]
if eTo < 0:
break
self.AP = (aNode,aEdge,aLength)
def build(self):
self.endpos = 0
remainder = 0
# The internal node most recently created.
prev_node = None
while self.endpos < len(self.text):
# We move forward the current position by 1 and update all
# leaves implicitly:
self.endpos += 1
# We increase remainder because the newest 1-character
# suffix has not been inserted yet.
remainder += 1
sys.stderr.write('==============\nITERATION #%d\n' % self.endpos)
# We deal with all remaining suffixes (i.e. those that
# have been inserted *implicitly* in previous steps, and
# the 1-character one of the current step).
while remainder > 0:
sys.stderr.write('Remainder %d\n' % remainder)
sys.stderr.write('AP (%d,%s,%d)\n' % self.AP)
newAP = self.is_implicit()
sys.stderr.write(' AP (%s,%s,%d)\n' % (newAP,self.AP[1],self.AP[2]))
if newAP is None:
sys.stderr.write('Not implicit\n')
# Case I: The current suffix is *not* implicictly
# contained. We need to make an insert.
new_node = self.insert()
remainder -= 1
if prev_node is not None and new_node is not None:
sys.stderr.write('Setting suxlink (A) from %d to %d\n' % (prev_node,new_node))
self.suxlinks[prev_node] = new_node
prev_node = new_node
(aNode,aEdge,aLength) = self.AP
if aNode == 0:
# If the insert was made from root, we reduce
# remainder and active-length and make the
# next insert from root, too.
if remainder == 0:
self.AP = (0,None,0)
else:
self.AP = (0,self.text[self.endpos - remainder],aLength - 1)
else:
# If the insert was made from an internal
# node, we follow its outgoing suffix link to
# identify the new active point.
if aNode in self.suxlinks:
sys.stderr.write('Follow suxlink from %d to %d.\n'
% (aNode,self.suxlinks[aNode]))
self.AP = (self.suxlinks[aNode],aEdge,aLength)
else:
sys.stderr.write('Go to root.\n')
self.AP = (0,aEdge,aLength)
if aLength > 0:
self.canonicize()
else:
# Case II: The current suffix *is* implicitly
# contained in the tree. We create a suffix link
# from the most recently created internal node and
# update the active point. The current step ends
# then.
(newNode,newEdge,newLength) = newAP
if prev_node is not None and newNode != 0:
if prev_node != newNode:
sys.stderr.write('Setting suxlink (B) from %d to %d\n' % (prev_node,newNode))
self.suxlinks[prev_node] = newNode
prev_node = None
self.AP = newAP
# No further remaining suffixes need to be
# inserted. They must all be implicitly contained.
break
def dot(self,outstream):
outstream.write('digraph {\n rankdir = LR;\n edge [arrowsize=0.4,fontsize=10];\n' +
' int0 [label="",style=filled,fillcolor=yellow,shape=circle,width=.1,height=.1];\n')
for node in range(1,self.max_node + 1):
outstream.write(
' int%d [label="",style=filled,fillcolor=lightgrey,shape=circle,width=.07,height=.07]\n' \
% node)
outstream.write('\n')
max_leaf = 0
for edge_id,edge in self.edges.items():
(fromNode,edgeChar) = self.invedgeid(edge_id)
(edgeFrom,edgeTo,toNode) = edge
if toNode is None:
edgeLabel = buffer(self.text,edgeFrom,self.endpos)
outstream.write(' leaf%d [label="",shape=point]\n' % max_leaf)
outstream.write(' int%d -> leaf%d [label="%s",weight=3]\n' %
(fromNode,max_leaf,edgeLabel))
max_leaf += 1
else:
edgeLabel = buffer(self.text,edgeFrom,edgeTo - edgeFrom)
outstream.write(' int%d -> int%d [label="%s",weight=3]\n' %
(fromNode,toNode,edgeLabel))
outstream.write('}\n')
if __name__ == '__main__':
tree = Suxtree(sys.argv[1])
tree.dot(sys.stdout)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment