Created
September 10, 2013 08:23
-
-
Save istepanov/6506508 to your computer and use it in GitHub Desktop.
Search longest common substrings using generalized suffix trees built with Ukkonen's algorithm, written in Python 2.7
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 | |
""" | |
Search longest common substrings using generalized suffix trees built with Ukkonen's algorithm | |
Author: Ilya Stepanov <code at ilyastepanov.com> | |
(c) 2013 | |
""" | |
from __future__ import print_function | |
import sys | |
import re | |
import argparse | |
END_OF_STRING = sys.maxint | |
class SuffixTreeNode: | |
""" | |
Suffix tree node class. Actually, it also respresents a tree edge that points to this node. | |
""" | |
new_identifier = 0 | |
def __init__(self, start=0, end=END_OF_STRING): | |
self.identifier = SuffixTreeNode.new_identifier | |
SuffixTreeNode.new_identifier += 1 | |
# suffix link is required by Ukkonen's algorithm | |
self.suffix_link = None | |
# child edges/nodes, each dict key represents the first letter of an edge | |
self.edges = {} | |
# stores reference to parent | |
self.parent = None | |
# bit vector shows to which strings this node belongs | |
self.bit_vector = 0 | |
# edge info: start index and end index | |
self.start = start | |
self.end = end | |
def add_child(self, key, start, end): | |
""" | |
Create a new child node | |
Agrs: | |
key: a char that will be used during active edge searching | |
start, end: node's edge start and end indices | |
Returns: | |
created child node | |
""" | |
child = SuffixTreeNode(start=start, end=end) | |
child.parent = self | |
self.edges[key] = child | |
return child | |
def add_exisiting_node_as_child(self, key, node): | |
""" | |
Add an existing node as a child | |
Args: | |
key: a char that will be used during active edge searching | |
node: a node that will be added as a child | |
""" | |
node.parent = self | |
self.edges[key] = node | |
def get_edge_length(self, current_index): | |
""" | |
Get length of an edge that points to this node | |
Args: | |
current_index: index of current processing symbol (usefull for leaf nodes that have "infinity" end index) | |
""" | |
return min(self.end, current_index + 1) - self.start | |
def __str__(self): | |
return 'id=' + str(self.identifier) | |
class SuffixTree: | |
""" | |
Generalized suffix tree | |
""" | |
def __init__(self): | |
# the root node | |
self.root = SuffixTreeNode() | |
# all strings are concatenaited together. Tree's nodes stores only indices | |
self.input_string = '' | |
# number of strings stored by this tree | |
self.strings_count = 0 | |
# list of tree leaves | |
self.leaves = [] | |
def append_string(self, input_string): | |
""" | |
Add new string to the suffix tree | |
""" | |
start_index = len(self.input_string) | |
current_string_index = self.strings_count | |
# each sting should have a unique ending | |
input_string += '$' + str(current_string_index) | |
# gathering 'em all together | |
self.input_string += input_string | |
self.strings_count += 1 | |
# these 3 variables represents current "active point" | |
active_node = self.root | |
active_edge = 0 | |
active_length = 0 | |
# shows how many | |
remainder = 0 | |
# new leaves appended to tree | |
new_leaves = [] | |
# main circle | |
for index in range(start_index, len(self.input_string)): | |
previous_node = None | |
remainder += 1 | |
while remainder > 0: | |
if active_length == 0: | |
active_edge = index | |
if self.input_string[active_edge] not in active_node.edges: | |
# no edge starting with current char, so creating a new leaf node | |
leaf_node = active_node.add_child(self.input_string[active_edge], index, END_OF_STRING) | |
# a leaf node will always be leaf node belonging to only one string | |
# (because each string has different termination) | |
leaf_node.bit_vector = 1 << current_string_index | |
new_leaves.append(leaf_node) | |
# doing suffix link magic | |
if previous_node is not None: | |
previous_node.suffix_link = active_node | |
previous_node = active_node | |
else: | |
# ok, we've got an active edge | |
next_node = active_node.edges[self.input_string[active_edge]] | |
# walking down through edges (if active_length is bigger than edge length) | |
next_edge_length = next_node.get_edge_length(index) | |
if active_length >= next_node.get_edge_length(index): | |
active_edge += next_edge_length | |
active_length -= next_edge_length | |
active_node = next_node | |
continue | |
# current edge already contains the suffix we need to insert. | |
# Increase the active_length and go forward | |
if self.input_string[next_node.start + active_length] == self.input_string[index]: | |
active_length += 1 | |
if previous_node is not None: | |
previous_node.suffix_link = active_node | |
previous_node = active_node | |
break | |
# splitting edge | |
split_node = active_node.add_child( | |
self.input_string[active_edge], | |
next_node.start, | |
next_node.start + active_length | |
) | |
next_node.start += active_length | |
split_node.add_exisiting_node_as_child(self.input_string[next_node.start], next_node) | |
leaf_node = split_node.add_child(self.input_string[index], index, END_OF_STRING) | |
leaf_node.bit_vector = 1 << current_string_index | |
new_leaves.append(leaf_node) | |
# suffix link magic again | |
if previous_node is not None: | |
previous_node.suffix_link = split_node | |
previous_node = split_node | |
remainder -= 1 | |
# follow suffix link (if exists) or go to root | |
if active_node == self.root and active_length > 0: | |
active_length -= 1 | |
active_edge = index - remainder + 1 | |
else: | |
active_node = active_node.suffix_link if active_node.suffix_link is not None else self.root | |
# update leaves ends from "infinity" to actual string end | |
for leaf in new_leaves: | |
leaf.end = len(self.input_string) | |
self.leaves.extend(new_leaves) | |
def find_longest_common_substrings(self): | |
""" | |
Search longest common substrings in the tree by locating lowest common ancestors what belong to all strings | |
""" | |
# all bits are set | |
success_bit_vector = 2 ** self.strings_count - 1 | |
lowest_common_ancestors = [] | |
# going up to the root | |
for leaf in self.leaves: | |
node = leaf | |
while node.parent is not None: | |
if node.bit_vector != success_bit_vector: | |
# updating parent's bit vector | |
node.parent.bit_vector |= node.bit_vector | |
node = node.parent | |
else: | |
# hey, we've found a lowest common ancestor! | |
lowest_common_ancestors.append(node) | |
break | |
longest_common_substrings = [''] | |
longest_length = 0 | |
# need to filter the result array and get the longest common strings | |
for common_ancestor in lowest_common_ancestors: | |
common_substring = '' | |
node = common_ancestor | |
while node.parent is not None: | |
label = self.input_string[node.start:node.end] | |
common_substring = label + common_substring | |
node = node.parent | |
# remove unique endings ($<number>), we don't need them anymore | |
common_substring = re.sub(r'(.*?)\$?\d*$', r'\1', common_substring) | |
if len(common_substring) > longest_length: | |
longest_length = len(common_substring) | |
longest_common_substrings = [common_substring] | |
elif len(common_substring) == longest_length and common_substring not in longest_common_substrings: | |
longest_common_substrings.append(common_substring) | |
return longest_common_substrings | |
def to_graphviz(self, node=None, output=''): | |
""" | |
Show the tree as graphviz string. For debugging purposes only | |
""" | |
if node is None: | |
node = self.root | |
output = 'digraph G {edge [arrowsize=0.4,fontsize=10];' | |
output +=\ | |
str(node.identifier) + '[label="' +\ | |
str(node.identifier) + '\\n' + '{0:b}'.format(node.bit_vector).zfill(self.strings_count) + '"' | |
if node.bit_vector == 2 ** self.strings_count - 1: | |
output += ',style="filled",fillcolor="red"' | |
output += '];' | |
if node.suffix_link is not None: | |
output += str(node.identifier) + '->' + str(node.suffix_link.identifier) + '[style="dashed"];' | |
for child in node.edges.values(): | |
label = self.input_string[child.start:child.end] | |
output += str(node.identifier) + '->' + str(child.identifier) + '[label="' + label + '"];' | |
output = self.to_graphviz(child, output) | |
if node == self.root: | |
output += '}' | |
return output | |
def __str__(self): | |
return self.to_graphviz() | |
def main(): | |
parser = argparse.ArgumentParser( | |
description='Searching longest common substring. ' | |
'Uses Ukkonen\'s suffix tree algorithm and generalized suffix tree. ' | |
'Written by Ilya Stepanov (c) 2013' | |
) | |
parser.add_argument( | |
'strings', | |
metavar='STRING', | |
nargs='*', | |
help='String for searching', | |
) | |
parser.add_argument( | |
'-f', | |
'--file', | |
help='Path for input file. First line should contain number of lines to search in' | |
) | |
parser.add_argument( | |
'-d', | |
'--debug', | |
help='Debug mode: shows generalized suffix tree in output (graphviz)', | |
action="store_true" | |
) | |
args = parser.parse_args() | |
if not args.strings and not args.file: | |
parser.print_help() | |
exit() | |
suffix_tree = SuffixTree() | |
for s in args.strings: | |
suffix_tree.append_string(s) | |
if args.file: | |
with open(args.file, 'rU') as f: | |
first_line = f.readline().strip('\r\n') | |
if first_line.isdigit(): | |
max_lines_count = int(first_line) | |
else: | |
max_lines_count = sys.maxint | |
suffix_tree.append_string(first_line) | |
for index, line in enumerate(f): | |
if index >= max_lines_count: | |
break | |
suffix_tree.append_string(line.strip('\r\n')) | |
lcs = suffix_tree.find_longest_common_substrings() | |
for s in lcs: | |
print(s) | |
if args.debug: | |
print(suffix_tree.to_graphviz()) | |
if __name__ == "__main__": | |
main() |
Hi; I am new to coding and was looking for some help. Where in the code do I type in the two strings that I want to compare?
its not correct
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@istepanov what would be the license for this fine code?