lucaswiman · October 30, 2024 04:31
diff --git a/prefix-doubling-suffix-array.py b/prefix-doubling-suffix-array.py
 def build_suffix_array(s: str):
    """
    Implements the "prefix doubling" algorithm.

    See https://en.wikipedia.org/wiki/Suffix_array
    """
    n = len(s)
    if n == 0:
        return []
    suffix_arr = list(range(n))

    # Rank by character unicode values. j-th rank means "sorts before j+1
    # lexicographically".
    rank = [ord(c) for c in s]
    smallest = min(rank)
    rank = [r - smallest for r in rank]

    k = 1
    # Claim: On the jth interation, rank[i] is the rank of the suffix of length
    #        k=2**j at position i.
    #
    # Base Case: This is true by construction for k=1 (j=0).

    def key(i):
        return (rank[i], rank[i + k] if i + k < n else -1)

    while k < n:
        suffix_arr.sort(key=key)
        # Lexicographically sorting by the suffix s[i:i+2*k] is the same thing
        # as sorting by (s[i:i+k], s[i+k:(i+k)+k]). By the inductive hypothesis,
        # this is the same as sorting by (rank[i], rank[i+k]). So suffix_arr is
        # now sorted by suffixes of length 2*k.

        # Now update the ranks based on the sort ordering.
        new_rank = [0] * n

        # Note that suffix_arr[0] already has the correct rank in new_rank (0).
        for i in range(1, n):
            if key(suffix_arr[i-1]) != key(suffix_arr[i]):
                new_rank[suffix_arr[i]] = new_rank[suffix_arr[i-1]] + 1
            else:
                new_rank[suffix_arr[i]] = new_rank[suffix_arr[i-1]]
        rank = new_rank
        # `rank` now reflects the ranking for suffixes of length at most 2*k,
        # so we update k to reflect that.
        k *= 2
        # If 2*k>n, we are done; the suffix array is sorted.
    return suffix_arr



 # Test case:
 def build_suffix_array_naive(s: str):
    suffixes = [s[i:] for i in range(len(s))]
    suffixes.sort()
    return [len(s) - len(suffix) for suffix in suffixes]


 for word in ("banana", "mississippi"):
    for i in range(10):
        text = "".join([word] * i)
        print(f"{text}")
        clever = build_suffix_array(text)
        assert sorted(clever) == list(range(len(text)))
        naive = build_suffix_array_naive(text)
        assert sorted(naive) == list(range(len(text)))
        assert clever == naive
	def build_suffix_array(s: str):
	"""
	Implements the "prefix doubling" algorithm.

	See https://en.wikipedia.org/wiki/Suffix_array
	"""
	n = len(s)
	if n == 0:
	return []
	suffix_arr = list(range(n))

	# Rank by character unicode values. j-th rank means "sorts before j+1
	# lexicographically".
	rank = [ord(c) for c in s]
	smallest = min(rank)
	rank = [r - smallest for r in rank]

	k = 1
	# Claim: On the jth interation, rank[i] is the rank of the suffix of length
	# k=2**j at position i.
	#
	# Base Case: This is true by construction for k=1 (j=0).

	def key(i):
	return (rank[i], rank[i + k] if i + k < n else -1)

	while k < n:
	suffix_arr.sort(key=key)
	# Lexicographically sorting by the suffix s[i:i+2*k] is the same thing
	# as sorting by (s[i:i+k], s[i+k:(i+k)+k]). By the inductive hypothesis,
	# this is the same as sorting by (rank[i], rank[i+k]). So suffix_arr is
	# now sorted by suffixes of length 2*k.

	# Now update the ranks based on the sort ordering.
	new_rank = [0] * n

	# Note that suffix_arr[0] already has the correct rank in new_rank (0).
	for i in range(1, n):
	if key(suffix_arr[i-1]) != key(suffix_arr[i]):
	new_rank[suffix_arr[i]] = new_rank[suffix_arr[i-1]] + 1
	else:
	new_rank[suffix_arr[i]] = new_rank[suffix_arr[i-1]]
	rank = new_rank
	# `rank` now reflects the ranking for suffixes of length at most 2*k,
	# so we update k to reflect that.
	k *= 2
	# If 2*k>n, we are done; the suffix array is sorted.
	return suffix_arr



	# Test case:
	def build_suffix_array_naive(s: str):
	suffixes = [s[i:] for i in range(len(s))]
	suffixes.sort()
	return [len(s) - len(suffix) for suffix in suffixes]


	for word in ("banana", "mississippi"):
	for i in range(10):
	text = "".join([word] * i)
	print(f"{text}")
	clever = build_suffix_array(text)
	assert sorted(clever) == list(range(len(text)))
	naive = build_suffix_array_naive(text)
	assert sorted(naive) == list(range(len(text)))
	assert clever == naive