Last active
October 30, 2024 04:31
-
-
Save lucaswiman/5cc6584a7c6316998357af1cbdf300ba to your computer and use it in GitHub Desktop.
Fast suffix array construction in python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def build_suffix_array(s: str): | |
""" | |
Implements the "prefix doubling" algorithm. | |
See https://en.wikipedia.org/wiki/Suffix_array | |
""" | |
n = len(s) | |
if n == 0: | |
return [] | |
suffix_arr = list(range(n)) | |
# Rank by character unicode values. j-th rank means "sorts before j+1 | |
# lexicographically". | |
rank = [ord(c) for c in s] | |
smallest = min(rank) | |
rank = [r - smallest for r in rank] | |
k = 1 | |
# Claim: On the jth interation, rank[i] is the rank of the suffix of length | |
# k=2**j at position i. | |
# | |
# Base Case: This is true by construction for k=1 (j=0). | |
def key(i): | |
return (rank[i], rank[i + k] if i + k < n else -1) | |
while k < n: | |
suffix_arr.sort(key=key) | |
# Lexicographically sorting by the suffix s[i:i+2*k] is the same thing | |
# as sorting by (s[i:i+k], s[i+k:(i+k)+k]). By the inductive hypothesis, | |
# this is the same as sorting by (rank[i], rank[i+k]). So suffix_arr is | |
# now sorted by suffixes of length 2*k. | |
# Now update the ranks based on the sort ordering. | |
new_rank = [0] * n | |
# Note that suffix_arr[0] already has the correct rank in new_rank (0). | |
for i in range(1, n): | |
if key(suffix_arr[i-1]) != key(suffix_arr[i]): | |
new_rank[suffix_arr[i]] = new_rank[suffix_arr[i-1]] + 1 | |
else: | |
new_rank[suffix_arr[i]] = new_rank[suffix_arr[i-1]] | |
rank = new_rank | |
# `rank` now reflects the ranking for suffixes of length at most 2*k, | |
# so we update k to reflect that. | |
k *= 2 | |
# If 2*k>n, we are done; the suffix array is sorted. | |
return suffix_arr | |
# Test case: | |
def build_suffix_array_naive(s: str): | |
suffixes = [s[i:] for i in range(len(s))] | |
suffixes.sort() | |
return [len(s) - len(suffix) for suffix in suffixes] | |
for word in ("banana", "mississippi"): | |
for i in range(10): | |
text = "".join([word] * i) | |
print(f"{text}") | |
clever = build_suffix_array(text) | |
assert sorted(clever) == list(range(len(text))) | |
naive = build_suffix_array_naive(text) | |
assert sorted(naive) == list(range(len(text))) | |
assert clever == naive |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment