Skip to content

Instantly share code, notes, and snippets.

@lucaswiman
Last active October 30, 2024 04:31
Show Gist options
  • Save lucaswiman/5cc6584a7c6316998357af1cbdf300ba to your computer and use it in GitHub Desktop.
Save lucaswiman/5cc6584a7c6316998357af1cbdf300ba to your computer and use it in GitHub Desktop.
Fast suffix array construction in python
def build_suffix_array(s: str):
"""
Implements the "prefix doubling" algorithm.
See https://en.wikipedia.org/wiki/Suffix_array
"""
n = len(s)
if n == 0:
return []
suffix_arr = list(range(n))
# Rank by character unicode values. j-th rank means "sorts before j+1
# lexicographically".
rank = [ord(c) for c in s]
smallest = min(rank)
rank = [r - smallest for r in rank]
k = 1
# Claim: On the jth interation, rank[i] is the rank of the suffix of length
# k=2**j at position i.
#
# Base Case: This is true by construction for k=1 (j=0).
def key(i):
return (rank[i], rank[i + k] if i + k < n else -1)
while k < n:
suffix_arr.sort(key=key)
# Lexicographically sorting by the suffix s[i:i+2*k] is the same thing
# as sorting by (s[i:i+k], s[i+k:(i+k)+k]). By the inductive hypothesis,
# this is the same as sorting by (rank[i], rank[i+k]). So suffix_arr is
# now sorted by suffixes of length 2*k.
# Now update the ranks based on the sort ordering.
new_rank = [0] * n
# Note that suffix_arr[0] already has the correct rank in new_rank (0).
for i in range(1, n):
if key(suffix_arr[i-1]) != key(suffix_arr[i]):
new_rank[suffix_arr[i]] = new_rank[suffix_arr[i-1]] + 1
else:
new_rank[suffix_arr[i]] = new_rank[suffix_arr[i-1]]
rank = new_rank
# `rank` now reflects the ranking for suffixes of length at most 2*k,
# so we update k to reflect that.
k *= 2
# If 2*k>n, we are done; the suffix array is sorted.
return suffix_arr
# Test case:
def build_suffix_array_naive(s: str):
suffixes = [s[i:] for i in range(len(s))]
suffixes.sort()
return [len(s) - len(suffix) for suffix in suffixes]
for word in ("banana", "mississippi"):
for i in range(10):
text = "".join([word] * i)
print(f"{text}")
clever = build_suffix_array(text)
assert sorted(clever) == list(range(len(text)))
naive = build_suffix_array_naive(text)
assert sorted(naive) == list(range(len(text)))
assert clever == naive
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment