Created
January 12, 2018 02:30
-
-
Save hynekcer/fa340f3b63826168ffc0c4b33310ae9c to your computer and use it in GitHub Desktop.
fast longest common substring - by suffix array
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""Find the longest repeated substring. | |
"Efficient way to find longest duplicate string for Python (From Programming Pearls)" | |
http://stackoverflow.com/questions/13560037/ | |
The algorithm is based on "Prefix doubling". | |
The worst time complexity is O(n (log n)^2). Memory requirements are linear. | |
""" | |
import time | |
from random import randint | |
import itertools | |
import sys | |
import unittest | |
from itertools import groupby | |
from operator import itemgetter | |
import logging | |
log = logging.getLogger(__name__) | |
log.setLevel(logging.INFO) | |
try: | |
log.addHandler(logging.NullHandler()) | |
except AttributeError: | |
pass | |
def run(): | |
if sys.argv[1:] == ['-']: | |
text = sys.stdin.read() | |
elif sys.argv[1:]: | |
print('Reading data...') | |
text = open(sys.argv[1]).read() | |
else: | |
text = 'banana' | |
print('Sorting...') | |
result = longest_common_substring(text) | |
print('Longest common substrings in "{0}..." are:\n{1}'.format( | |
text[:20], result)) | |
def longest_common_substring(text): | |
"""Get the longest common substrings and their positions. | |
>>> longest_common_substring('banana') | |
{'ana': [1, 3]} | |
>>> text = "not so Agamemnon, who spoke fiercely to " | |
>>> sorted(longest_common_substring(text).items()) | |
[(' s', [3, 21]), ('no', [0, 13]), ('o ', [5, 20, 38])] | |
This function can be easy modified for any criteria, e.g. for searching ten | |
longest non overlapping repeated substrings. | |
""" | |
sa, rsa, lcp = suffix_array(text) | |
maxlen = max(lcp) | |
result = {} | |
for i in range(1, len(text)): | |
if lcp[i] == maxlen: | |
j1, j2, h = sa[i - 1], sa[i], lcp[i] | |
assert text[j1:j1 + h] == text[j2:j2 + h] | |
substring = text[j1:j1 + h] | |
if substring not in result: | |
result[substring] = [j1] | |
result[substring].append(j2) | |
return dict((k, sorted(v)) for k, v in result.items()) | |
def suffix_array(text, _step=16): | |
"""Analyze all common strings in the text. | |
Short substrings of the length _step a are first pre-sorted. The are the | |
results repeatedly merged so that the garanteed number of compared | |
characters bytes is doubled in every iteration until all substrings are | |
sorted exactly. | |
Arguments: | |
text: The text to be analyzed. | |
_step: Is only for optimization and testing. It is the optimal length | |
of substrings used for initial pre-sorting. The bigger value is | |
faster if there is enough memory. Memory requirements are | |
approximately (estimate for 32 bit Python 3.3): | |
len(text) * (29 + (_size + 20 if _size > 2 else 0)) + 1MB | |
Return value: (tuple) | |
(sa, rsa, lcp) | |
sa: Suffix array for i in range(1, size): | |
assert text[sa[i-1]:] < text[sa[i]:] | |
rsa: Reverse suffix array for i in range(size): | |
assert rsa[sa[i]] == i | |
lcp: Longest common prefix for i in range(1, size): | |
assert text[sa[i-1]:sa[i-1]+lcp[i]] == text[sa[i]:sa[i]+lcp[i]] | |
if sa[i-1] + lcp[i] < len(text): | |
assert text[sa[i-1] + lcp[i]] < text[sa[i] + lcp[i]] | |
>>> suffix_array(text='banana') | |
([5, 3, 1, 0, 4, 2], [3, 2, 5, 1, 4, 0], [0, 1, 3, 0, 0, 2]) | |
Explanation: 'a' < 'ana' < 'anana' < 'banana' < 'na' < 'nana' | |
The Longest Common String is 'ana': lcp[2] == 3 == len('ana') | |
It is between tx[sa[1]:] == 'ana' < 'anana' == tx[sa[2]:] | |
""" | |
tx = text | |
t0 = time.time() | |
size = len(tx) | |
step = min(max(_step, 1), len(tx)) | |
sa = list(range(len(tx))) | |
log.debug('%6.3f pre sort', time.time() - t0) | |
sa.sort(key=lambda i: tx[i:i + step]) | |
log.debug('%6.3f after sort', time.time() - t0) | |
grpstart = size * [False] + [True] # a boolean map for iteration speedup. | |
# It helps to skip yet resolved values. The last value True is a sentinel. | |
rsa = size * [None] | |
stgrp, igrp = '', 0 | |
for i, pos in enumerate(sa): | |
st = tx[pos:pos + step] | |
if st != stgrp: | |
grpstart[igrp] = (igrp < i - 1) | |
stgrp = st | |
igrp = i | |
rsa[pos] = igrp | |
sa[i] = pos | |
grpstart[igrp] = (igrp < size - 1 or size == 0) | |
log.debug('%6.3f after group', time.time() - t0) | |
while grpstart.index(True) < size: | |
# assert step <= size | |
nmerge = 0 | |
nextgr = grpstart.index(True) | |
while nextgr < size: | |
igrp = nextgr | |
nextgr = grpstart.index(True, igrp + 1) | |
glist = [] | |
for ig in range(igrp, nextgr): | |
pos = sa[ig] | |
if rsa[pos] != igrp: | |
break | |
newgr = rsa[pos + step] if pos + step < size else -1 | |
glist.append((newgr, pos)) | |
glist.sort() | |
for ig, g in groupby(glist, key=itemgetter(0)): | |
g = [x[1] for x in g] | |
sa[igrp:igrp + len(g)] = g | |
grpstart[igrp] = (len(g) > 1) | |
for pos in g: | |
rsa[pos] = igrp | |
igrp += len(g) | |
nmerge += len(glist) | |
log.debug('%6.3f for step=%d nmerge=%d', time.time() - t0, step, nmerge) | |
step *= 2 | |
del grpstart | |
# create LCP array | |
lcp = size * [None] | |
h = 0 | |
for i in range(size): | |
if rsa[i] > 0: | |
j = sa[rsa[i] - 1] | |
while i != size - h and j != size - h and tx[i + h] == tx[j + h]: | |
h += 1 | |
lcp[rsa[i]] = h | |
if h > 0: | |
h -= 1 | |
if size > 0: | |
lcp[0] = 0 | |
log.debug('%6.3f end', time.time() - t0) | |
return sa, rsa, lcp | |
# --- | |
class TestMixin(object): | |
def suffix_verify(self, text, step=16): | |
tx = text | |
sa, rsa, lcp = suffix_array(text=tx, _step=step) | |
self.assertEqual(set(sa), set(range(len(tx)))) | |
ok = True | |
for i0, i1, h in zip(sa[:-1], sa[1:], lcp[1:]): | |
self.assertEqual(tx[i1:i1 + h], tx[i0:i0 + h], "Verify LCP characters equal on text '%s...'" % text[:20]) | |
self.assertGreater(tx[i1 + h:i1 + h + 1], tx[i0 + h:i0 + h + 1], | |
"Verify LCP+1 char is different '%s...'" % text[:20]) | |
self.assertLessEqual(max(i0, i1), len(tx) - h, | |
"Verify LCP is not more than length of string '%s...'" % text[:20]) | |
self.assertTrue(ok) | |
class SuffixArrayTest(unittest.TestCase, TestMixin): | |
def test_16(self): | |
# 'a' < 'ana' < 'anana' < 'banana' < 'na' < 'nana' | |
expect = ([5, 3, 1, 0, 4, 2], [3, 2, 5, 1, 4, 0], [0, 1, 3, 0, 0, 2]) | |
self.assertEqual(suffix_array(text='banana', _step=16), expect) | |
def test_1(self): | |
expect = ([5, 3, 1, 0, 4, 2], [3, 2, 5, 1, 4, 0], [0, 1, 3, 0, 0, 2]) | |
self.assertEqual(suffix_array(text='banana', _step=1), expect) | |
def test_mini(self): | |
self.assertEqual(suffix_array(text='', _step=1), ([], [], [])) | |
self.assertEqual(suffix_array(text='a', _step=1), ([0], [0], [0])) | |
self.assertEqual(suffix_array(text='aa', _step=1), ([1, 0], [1, 0], [0, 1])) | |
self.assertEqual(suffix_array(text='aaa', _step=1), ([2, 1, 0], [2, 1, 0], [0, 1, 2])) | |
def test_example(self): | |
self.suffix_verify('abracadabra') | |
def test_cartesian(self): | |
"""Test all combinations of alphabet "ABC" up to length 4 characters""" | |
for size in range(7): | |
for cartesian in itertools.product(*(size * ['ABC'])): | |
text = ''.join(cartesian) | |
log.debug('Testing "%s"', text) | |
self.suffix_verify(text, 1) | |
def test_lcp(self): | |
expect = {'ana': [1, 3]} | |
self.assertDictEqual(longest_common_substring('banana'), expect) | |
expect = {' s': [3, 21], 'no': [0, 13], 'o ': [5, 20, 38]} | |
self.assertDictEqual(longest_common_substring( | |
"not so Agamemnon, who spoke fiercely to "), expect) | |
class SlowTests(unittest.TestCase, TestMixin): | |
"""Slow development tests running many minutes. | |
It can be run only by an EXPLICIT command! | |
e.g.: python -m unittest maxsubstring.SlowTests._test_random | |
""" | |
def _test_random(self): | |
for power in range(2, 21, 2): | |
size = randint(2 ** (power - 1), 2 ** power) | |
for alphabet in (2, 4, 16, 256): | |
text = ''.join(chr(65 + randint(0, alphabet - 1)) for _ in range(size)) | |
log.debug('%s %s %s', size, alphabet, 1) | |
self.suffix_verify(text, 1) | |
log.debug('%s %s %s', size, alphabet, 16) | |
self.suffix_verify(text, 16) | |
if __name__ == '__main__': | |
run() |
@likengmx Imagine that you have a text "ABABABAB". The longest common subtring is "ABABAB" for strings at the offset 0 and 2. With LCP = 6. but the non overlapping part is not longer than the difference of offsets.
I admit that it is not so trivial, because the longest substring is between offsets 0 and 4, that are not lexicographically neighboring.
Please read also the previous answer today to other user. Excuse for late answer.
@likengmx I have written an ugly little function that does what you probably want:
def LRS(text, m=3, n=10): # Longest repeated non-overlapping substrings
"""
Returns longest (maximum n number of) non-overlapping (minimum m times)
repeated strings and how many times they are found in the text in
descending order of count*len(substring)^2.
"""
sa, rsa, lcp = suffix_array(text)
result = {}
for i in range(0, len(sa)):
l = lcp[i]
if l < 2: # Don't bother with individual characters.
continue
substring = text[sa[i]:sa[i] + l]
c = text.count(substring) # This is the bottleneck.
if c < m: # Not found enough times.
continue
halt = False
for s in list(result.keys()): # Filter out overlaps.
if len(s) >= l:
if substring in s:
halt = True
break
elif s in substring:
del result[s]
if not halt:
result[substring] = c
return dict(sorted(result.items(),
key=lambda x: x[1]*(len(x[0])**2), reverse=True)[:n])
With this, we can for example look around in relatively large text files or code, and look for duplicates, prioritising long and common ones. Might be a nice feature to add to the search window in Notepad++
... 😃
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@lwanchia You run
python maxsubstring.py input_text_file
on the command-lineYou can get more different repeated strings of the same maximal length and every of them can exist more than twice. Therefore you get a dictionary where you get the string and a list of offsets where the string can be found: {'first_longest_repeated_text': [offset_1. offset_2], ...}
It is a demo program for a question on Effcient way to find longest duplicate string for Python (on SO).
The function
run()
is a simple input /output. The functionlongest_common_substring()
is the most interesting code to read, if you want to modify it. The arraylcp
(Longest Common Prefix) is the common length. The arraysa
(Suffix Array) are the offsets of alphabetically sorted strings from the position to the end of text. Imagine that you have a very long string that contains only million uppercase letters without spaces or other characters. Imagine that the last character of the text is "A". At the beginning you find offsets of the final "A" because string is before one character "A". Very soon continue strings like "AAA..." and at the end you find offsets of strings like "ZZZ..." Every neighbouring twins of string is compared exactly until a difference is found or the end of the shorter string is found. Maybe more than half million characters must be compared if the common part is very long. In thelcp
array you find how many initial characters are the equal between every sorted nearest strings.