Created
September 14, 2013 20:58
-
-
Save msg555/6565577 to your computer and use it in GitHub Desktop.
Linear time DC3 Suffix Array Construction (And driver program to solve http://uva.onlinejudge.org/index.php?option=onlinejudge&page=show_problem&problem=2507)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <string.h> | |
#include <stdio.h> | |
using namespace std; | |
#define MAXN 1010 | |
int AN; | |
int A[3 * MAXN + 100]; | |
int cnt[MAXN + 1]; // Should be >= 256 | |
int SA[MAXN + 1]; | |
/* Used by suffix_array. */ | |
void radix_pass(int* A, int AN, int* R, int RN, int* D) { | |
memset(cnt, 0, sizeof(int) * (AN + 1)); | |
int* C = cnt + 1; | |
for(int i = 0; i < RN; i++) ++C[A[R[i]]]; | |
for(int i = -1, v = 0; i <= AN && v < RN; v += C[i++]) swap(v, C[i]); | |
for(int i = 0; i < RN; i++) D[C[A[R[i]]]++] = R[i]; | |
} | |
/* DC3 in O(N) using 20N bytes of memory. Stores the suffix array of the string | |
* [A,A+AN) into SA where SA[i] (0<=i<=AN) gives the starting position of the | |
* i-th least suffix of A (including the empty suffix). | |
*/ | |
void suffix_array(int* A, int AN) { | |
// Base case... length 1 string. | |
if(!AN) { | |
SA[0] = 0; | |
} else if(AN == 1) { | |
SA[0] = 1; SA[1] = 0; | |
return; | |
} | |
// Sort all strings of length 3 starting at non-multiples of 3 into R. | |
int RN = 0; | |
int* SUBA = A + AN + 2; | |
int* R = SUBA + AN + 2; | |
for(int i = 1; i < AN; i += 3) SUBA[RN++] = i; | |
for(int i = 2; i < AN; i += 3) SUBA[RN++] = i; | |
A[AN + 1] = A[AN] = -1; | |
radix_pass(A + 2, AN - 2, SUBA, RN, R); | |
radix_pass(A + 1, AN - 1, R, RN, SUBA); | |
radix_pass(A + 0, AN - 0, SUBA, RN, R); | |
// Compute the relabel array if we need to recursively solve for the | |
// non-multiples. | |
int resfix, resmul, v; | |
if(AN % 3 == 1) { | |
resfix = 1; resmul = RN >> 1; | |
} else { | |
resfix = 2; resmul = RN + 1 >> 1; | |
} | |
for(int i = v = 0; i < RN; i++) { | |
v += i && (A[R[i - 1] + 0] != A[R[i] + 0] || | |
A[R[i - 1] + 1] != A[R[i] + 1] || | |
A[R[i - 1] + 2] != A[R[i] + 2]); | |
SUBA[R[i] / 3 + (R[i] % 3 == resfix) * resmul] = v; | |
} | |
// Recursively solve if needed to compute relative ranks in the final suffix | |
// array of all non-multiples. | |
if(v + 1 != RN) { | |
suffix_array(SUBA, RN); | |
SA[0] = AN; | |
for(int i = 1; i <= RN; i++) { | |
SA[i] = SA[i] < resmul ? 3 * SA[i] + (resfix==1?2:1) : | |
3 * (SA[i] - resmul) + resfix; | |
} | |
} else { | |
SA[0] = AN; | |
memcpy(SA + 1, R, sizeof(int) * RN); | |
} | |
// Compute the relative ordering of the multiples. | |
int NMN = RN; | |
for(int i = RN = 0; i <= NMN; i++) { | |
if(SA[i] % 3 == 1) { | |
SUBA[RN++] = SA[i] - 1; | |
} | |
} | |
radix_pass(A, AN, SUBA, RN, R); | |
// Compute the reverse SA for what we know so far. | |
for(int i = 0; i <= NMN; i++) { | |
SUBA[SA[i]] = i; | |
} | |
// Merge the orderings. | |
int ii = RN - 1; | |
int jj = NMN; | |
int pos; | |
for(pos = AN; ii >= 0; pos--) { | |
int i = R[ii]; | |
int j = SA[jj]; | |
int v = A[i] - A[j]; | |
if(!v) { | |
if(j % 3 == 1) { | |
v = SUBA[i + 1] - SUBA[j + 1]; | |
} else { | |
v = A[i + 1] - A[j + 1]; | |
if(!v) v = SUBA[i + 2] - SUBA[j + 2]; | |
} | |
} | |
SA[pos] = v < 0 ? SA[jj--] : R[ii--]; | |
} | |
} | |
char s[MAXN + 1]; | |
/* Copies the string in s into A and reduces the characters as needed. */ | |
void prep_string() { | |
int v = AN = 0; | |
memset(cnt, 0, 256 * sizeof(int)); | |
for(char* ss = s; *ss; ++ss, ++AN) cnt[*ss]++; | |
for(int i = 0; i < AN; i++) cnt[s[i]]++; | |
for(int i = 0; i < 256; i++) cnt[i] = cnt[i] ? v++ : -1; | |
for(int i = 0; i < AN; i++) A[i] = cnt[s[i]]; | |
} | |
/* Computes the reverse SA index. REVSA[i] gives the index of the suffix | |
* starting a i in the SA array. In other words, REVSA[i] gives the number of | |
* suffixes before the suffix starting at i. This can be useful in itself but | |
* is also used for compute_lcp(). | |
*/ | |
int REVSA[MAXN + 1]; | |
void compute_reverse_sa() { | |
for(int i = 0; i <= AN; i++) { | |
REVSA[SA[i]] = i; | |
} | |
} | |
/* Computes the longest common prefix between adjacent suffixes. LCP[i] gives | |
* the longest common suffix between the suffix starting at i and the next | |
* smallest suffix. Runs in O(N) time. | |
*/ | |
int LCP[MAXN + 1]; | |
void compute_lcp() { | |
int len = 0; | |
for(int i = 0; i < AN; i++, len = max(0, len - 1)) { | |
int s = REVSA[i]; | |
int j = SA[s - 1]; | |
for(; i + len < AN && j + len < AN && A[i + len] == A[j + len]; len++); | |
LCP[s] = len; | |
} | |
} | |
int main() { | |
int T; cin >> T; | |
for(int t = 1; t <= T; t++) { | |
scanf("%s", s); | |
int AN = strlen(s); | |
prep_string(); | |
suffix_array(A, AN); | |
compute_reverse_sa(); | |
compute_lcp(); | |
char* mxs = NULL; | |
int mxlen = -1; | |
int mxnum = 0; | |
for(int i = 0; i <= AN; i++) { | |
if(LCP[i] > mxlen) { | |
mxlen = LCP[i]; | |
mxs = s + SA[i]; | |
mxnum = 2; | |
while(i + 1 <= AN && LCP[i + 1] == LCP[i]) { | |
mxnum++; | |
i++; | |
} | |
} | |
} | |
if(mxlen == 0) { | |
printf("No repetitions found!\n"); | |
} else { | |
mxs[mxlen] = 0; | |
printf("%s %d\n", mxs, mxnum); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment