Created
March 1, 2012 14:53
-
-
Save dvberkel/1950267 to your computer and use it in GitHub Desktop.
Duvals Algorithm for generating Lyndon words
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Lyndon.py | |
Algorithms on strings and sequences based on Lyndon words. | |
David Eppstein, October 2011.""" | |
import unittest | |
from Eratosthenes import MoebiusFunction | |
def LengthLimitedLyndonWords(s,n): | |
"""Generate nonempty Lyndon words of length <= n over an s-symbol alphabet. | |
The words are generated in lexicographic order, using an algorithm from | |
J.-P. Duval, Theor. Comput. Sci. 1988, doi:10.1016/0304-3975(88)90113-2. | |
As shown by Berstel and Pocchiola, it takes constant average time | |
per generated word.""" | |
w = [-1] # set up for first increment | |
while w: | |
w[-1] += 1 # increment the last non-z symbol | |
yield w | |
m = len(w) | |
while len(w) < n: # repeat word to fill exactly n syms | |
w.append(w[-m]) | |
while w and w[-1] == s - 1: # delete trailing z's | |
w.pop() | |
def LyndonWordsWithLength(s,n): | |
"""Generate Lyndon words of length exactly n over an s-symbol alphabet. | |
Since nearly half of the outputs of LengthLimitedLyndonWords(s,n) | |
have the desired length, it again takes constant average time per word.""" | |
if n == 0: | |
yield [] # the empty word is a special case not handled by main alg | |
for w in LengthLimitedLyndonWords(s,n): | |
if len(w) == n: | |
yield w | |
def LyndonWords(s): | |
"""Generate all Lyndon words over an s-symbol alphabet. | |
The generation order is by length, then lexicographic within each length.""" | |
n = 0 | |
while True: | |
for w in LyndonWordsWithLength(s,n): | |
yield w | |
n += 1 | |
def DeBruijnSequence(s,n): | |
"""Generate a De Bruijn sequence for words of length n over s symbols | |
by concatenating together in lexicographic order the Lyndon words | |
whose lengths divide n. The output length will be s^n. | |
Because nearly half of the generated sequences will have length | |
exactly n, the algorithm will take O(s^n/n) steps, and the bulk | |
of the time will be spent in sequence concatenation.""" | |
output = [] | |
for w in LengthLimitedLyndonWords(s,n): | |
if n % len(w) == 0: | |
output += w | |
return output | |
def CountLyndonWords(s,n): | |
"""The number of length-n Lyndon words over s symbols.""" | |
if n == 0: | |
return 1 | |
total = 0 | |
for i in range(1,n+1): | |
if n%i == 0: | |
total += MoebiusFunction(n/i) * s**i | |
return total//n | |
def ChenFoxLyndonBreakpoints(s): | |
"""Find starting positions of Chen-Fox-Lyndon decomposition of s. | |
The decomposition is a set of Lyndon words that start at 0 and | |
continue until the next position. 0 itself is not output, but | |
the final breakpoint at the end of s is. The argument s must be | |
of a type that can be indexed (e.g. a list, tuple, or string). | |
The algorithm follows Duval, J. Algorithms 1983, but uses 0-based | |
indexing rather than Duval's choice of 1-based indexing.""" | |
k = 0 | |
while k < len(s): | |
i,j = k,k+1 | |
while j < len(s) and s[i] <= s[j]: | |
i = (s[i] == s[j]) and i+1 or k # Python cond?yes:no syntax | |
j += 1 | |
while k < i+1: | |
k += j-i | |
yield k | |
def ChenFoxLyndon(s): | |
"""Decompose s into Lyndon words according to the Chen-Fox-Lyndon theorem. | |
The arguments are the same as for ChenFoxLyndonBreakpoints but the | |
return values are subsequences of s rather than indices of breakpoints.""" | |
old = 0 | |
for k in ChenFoxLyndonBreakpoints(s): | |
yield s[old:k] | |
old = k | |
def SmallestSuffix(s): | |
"""Find the suffix of s that is smallest in lexicographic order.""" | |
for w in ChenFoxLyndon(s): | |
pass | |
return w | |
def SmallestRotation(s): | |
"""Find the rotation of s that is smallest in lexicographic order. | |
Duval 1983 describes how to modify his algorithm to do so but I think | |
it's cleaner and more general to work from the ChenFoxLyndon output.""" | |
prev,rep = None,0 | |
for w in ChenFoxLyndon(s+s): | |
if w == prev: | |
rep += 1 | |
else: | |
prev,rep = w,1 | |
if len(w)*rep == len(s): | |
return w*rep | |
raise Exception("Reached end of factorization with no shortest rotation") | |
def isLyndonWord(s): | |
"""Is the given sequence a Lyndon word?""" | |
if len(s) == 0: | |
return True | |
return ChenFoxLyndonBreakpoints(s).next() == len(s) | |
# If run standalone, perform unit tests | |
class LyndonTest(unittest.TestCase): | |
def testCount(self): | |
"""Test that we count Lyndon words correctly.""" | |
for s in range(2,7): | |
for n in range(1,6): | |
self.assertEqual(CountLyndonWords(s,n), | |
len(list(LyndonWordsWithLength(s,n)))) | |
def testOrder(self): | |
"""Test that we generate Lyndon words in lexicographic order.""" | |
for s in range(2,7): | |
for n in range(1,6): | |
prev = None | |
for x in LengthLimitedLyndonWords(s,n): | |
self.assert_(prev < x) | |
prev = list(x) | |
def testSubsequence(self): | |
"""Test that words of length n-1 are a subsequence of length n.""" | |
for s in range(2,7): | |
for n in range(2,6): | |
smaller = LengthLimitedLyndonWords(s,n-1) | |
for x in LengthLimitedLyndonWords(s,n): | |
if len(x) < n: | |
self.assertEqual(x,smaller.next()) | |
def testIsLyndon(self): | |
"""Test that the words we generate are Lyndon words.""" | |
for s in range(2,7): | |
for n in range(2,6): | |
for w in LengthLimitedLyndonWords(s,n): | |
self.assertEqual(isLyndonWord(w), True) | |
def testNotLyndon(self): | |
"""Test that words that are not Lyndon words aren't claimed to be.""" | |
nl = sum(1 for i in range(8**4) if isLyndonWord("%04o" % i)) | |
self.assertEqual(nl,CountLyndonWords(8,4)) | |
def testDeBruijn(self): | |
"""Test that the De Bruijn sequence is correct.""" | |
for s in range(2,7): | |
for n in range(1,6): | |
db = DeBruijnSequence(s,n) | |
self.assertEqual(len(db), s**n) | |
db = db + db # duplicate so we can wrap easier | |
subs = set(tuple(db[i:i+n]) for i in range(s**n)) | |
self.assertEqual(len(subs), s**n) | |
if __name__ == "__main__": | |
unittest.main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment