Created
July 11, 2011 15:49
-
-
Save nojima/1076138 to your computer and use it in GitHub Desktop.
Suffix Array Construction and String Search
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm> | |
#include <climits> | |
#include <cstdlib> | |
#include <cstring> | |
#include <vector> | |
using namespace std; | |
struct Comp1 { | |
const char* str; | |
Comp1(const char* str): str(str) {} | |
bool operator()(int s1, int s2) const { | |
return strcmp(str+s1, str+s2) < 0; | |
} | |
}; | |
void build_suffix_array1(const char* A, /*OUT*/ vector<int>& S) { | |
int n = strlen(A); | |
S.resize(n+1); | |
for (int i = 0; i < n+1; ++i) { S[i] = i; } | |
sort(S.begin(), S.end(), Comp1(A)); | |
} | |
struct Comp2 { | |
int h; | |
const vector<int>& group; | |
Comp2(int h, const vector<int>& group): h(h), group(group) {} | |
bool operator()(int s1, int s2) const { | |
return group[s1+h] < group[s2+h]; | |
} | |
}; | |
void build_suffix_array2(const char* A, /*OUT*/ vector<int>& S) { | |
typedef unsigned char uchar; | |
int n = strlen(A); | |
vector<int> group(n+1), next(n+2), bucket(256); | |
S.resize(n+1); | |
for (int i = 0; i < n+1; ++i) { ++bucket[(uchar)A[i]]; } | |
for (int i = 1; i < 256; ++i) { bucket[i] += bucket[i-1]; } | |
for (int i = 0; i < n+1; ++i) { S[--bucket[(uchar)A[i]]] = i; } | |
for (int i = 0; i < n+1; ++i) { group[i] = bucket[(uchar)A[i]]; } | |
for (int i = 0; i < 255; ++i) { next[bucket[i]] = bucket[i+1]; } | |
next[bucket[255]] = n+1; | |
for (int h = 1; next[0] != -n-1; h *= 2) { | |
int prev = -1; | |
for (int i = 0; i < n+1; i = abs(next[i])) { | |
if (next[i]-i <= 1) { | |
if (prev == -1) { prev = i; } | |
else { next[prev] = -i; } | |
} else { | |
prev = -1; | |
} | |
} | |
if (prev != -1) { next[prev] = -n-1; } | |
Comp2 comp(h, group); | |
for (int i = 0; i < n+1; ) { | |
if (next[i] < 0) { | |
i = -next[i]; | |
} else { | |
sort(S.begin()+i, S.begin()+next[i], comp); | |
int prev = i, nxt = next[i]; | |
for (int j = i+1; j < nxt; ++j) { | |
if (comp(S[j-1], S[j])) { next[prev] = j; prev = j; } | |
} | |
i = next[prev] = nxt; | |
} | |
} | |
for (int i = 0; i < n+1; i = abs(next[i])) { | |
for (int j = i; j < next[i]; ++j) { group[S[j]] = i; } | |
} | |
} | |
} | |
void build_lcp1(const char* A, const vector<int>& S, | |
/*OUT*/ vector<int>& height) { | |
int n = S.size() - 1; | |
height = vector<int>(n+1); | |
for (int i = 1; i < n+1; ++i) { | |
while (A[S[i-1]+height[i]] == A[S[i]+height[i]]) { ++height[i]; } | |
} | |
} | |
void build_lcp2(const char* A, const vector<int>& S, | |
/*OUT*/ vector<int>& height) { | |
int n = S.size() - 1, h = 0; | |
vector<int> rank(n+1); | |
height = vector<int>(n+1); | |
for (int i = 0; i < n+1; ++i) { rank[S[i]] = i; } | |
for (int i = 0; i < n+1; ++i) { | |
if (rank[i] > 0) { | |
int j = S[rank[i]-1]; | |
while (A[i+h] == A[j+h]) { ++h; } | |
height[rank[i]] = h; | |
} | |
if (h > 0) { --h; } | |
} | |
} | |
void build_rmq(const vector<int>& array, | |
/*OUT*/ vector<int>& rmq) { | |
int n = array.size(), N = 1; | |
while (N <= n) { N *= 2; } | |
rmq = vector<int>(2*N-1, INT_MAX); | |
copy(array.begin(), array.end(), rmq.begin()+N-1); | |
for (int v = N-2; v >= 0; --v) { | |
rmq[v] = min(rmq[2*v+1], rmq[2*v+2]); | |
} | |
} | |
int rmq_min(const vector<int>& rmq, int v, | |
int Lv, int Rv, int Lq, int Rq) { | |
int mid = (Lv+Rv)/2, ret = INT_MAX; | |
if (Rq < mid) { return rmq_min(rmq, 2*v+1, Lv, mid, Lq, Rq); } | |
if (Lq > mid) { return rmq_min(rmq, 2*v+2, mid, Rv, Lq, Rq); } | |
if (Lq < mid) { | |
if (Lq == Lv) { ret = min(ret, rmq[2*v+1]); } | |
else { ret = min(ret, rmq_min(rmq, 2*v+1, Lv, mid, Lq, mid)); } | |
} | |
if (Rq > mid) { | |
if (Rq == Rv) { ret = min(ret, rmq[2*v+2]); } | |
else { ret = min(ret, rmq_min(rmq, 2*v+2, mid, Rv, mid, Rq)); } | |
} | |
return ret; | |
} | |
int rmq_min(const vector<int>& rmq, int L, int R) { | |
int n = (rmq.size()+1)/2; | |
if (L >= R) { return INT_MAX; } | |
if (L == 0 && R == n) { return rmq[0]; } | |
return rmq_min(rmq, 0, 0, n, L, R); | |
} | |
struct Comp { | |
const char* A; | |
Comp(const char* A): A(A) {} | |
bool operator()(int s, const char* w) const { | |
return strcmp(A+s, w) < 0; | |
} | |
}; | |
bool search0(const char* W, const char* A) { | |
for (int i = 0; A[i]; ++i) { | |
int j; | |
for (j = 0; W[j]; ++j) { | |
if (W[j] != A[i+j]) { break; } | |
} | |
if (W[j] == 0) { return true; } | |
} | |
return false; | |
} | |
bool search1(const char* W, const char* A, const vector<int>& S) { | |
return strncmp(A+*lower_bound(S.begin(), S.end(), W, Comp(A)), | |
W, strlen(W)) == 0; | |
} | |
inline int suffix_suffix_lcp(int i, int j, const vector<int>& rmq) { | |
return rmq_min(rmq, i+1, j+1); | |
} | |
inline int suffix_string_lcp(int s_i, int p, const char* W, | |
int n, const char* A) { | |
int j = 0; | |
while (j < p && A[s_i + j] == W[j]) { ++j; } | |
return j; | |
} | |
bool search2(const char* W, const char* A, | |
const vector<int>& S, const vector<int>& rmq) { | |
int p = strlen(W), n = S.size()-1; | |
int L = 0, R = n; | |
int l = suffix_string_lcp(S[L], p, W, n, A); | |
int r = suffix_string_lcp(S[R], p, W, n, A); | |
while (R - L > 1) { | |
int M = (L+R)/2; | |
if (l >= r) { | |
int m = suffix_suffix_lcp(L, M, rmq); | |
if (m < l) { R = M; r = m; } | |
else if (m > l) { L = M; } | |
else { | |
int k = l + suffix_string_lcp(S[M]+l, p-l, W+l, n, A); | |
if (k == p || A[S[M]+k] < W[k]) { L = M; l = k; } | |
else { R = M; r = k; } | |
} | |
} else { | |
int m = suffix_suffix_lcp(M, R, rmq); | |
if (m < r) { L = M; l = m; } | |
else if (m > r) { R = M; } | |
else { | |
int k = r + suffix_string_lcp(S[M]+r, p-r, W+r, n, A); | |
if (k == p || A[S[M]+k] < W[k]) { L = M; l = k; } | |
else { R = M; r = k; } | |
} | |
} | |
} | |
return l == p || r == p; | |
} | |
//// TEST ///////////////////////////////////////////////////////////////////// | |
#include <cstdio> | |
void test_build_suffix_array() { | |
printf("testing build_suffix_array()..."); | |
fflush(stdout); | |
bool ok = true; | |
char str[1024]; | |
for (int t = 0; t < 10000; ++t) { | |
int n = rand() % 100; | |
int p = rand() % 254 + 1; | |
for (int i = 0; i < n; ++i) { | |
str[i] = rand() % p + 1; | |
} | |
str[n] = 0; | |
vector<int> expected; | |
build_suffix_array1(str, expected); | |
vector<int> answer; | |
build_suffix_array2(str, answer); | |
if (expected != answer) { | |
printf("FAILED!! t=%d, n=%d, p=%d\n", t, n, p); | |
ok = false; | |
} | |
} | |
for (int t = 0; t < 1000; ++t) { | |
int n = t, c = rand() % 254 + 1; | |
for (int i = 0; i < n; ++i) { str[i] = c; } | |
str[n] = 0; | |
vector<int> expected; | |
build_suffix_array1(str, expected); | |
vector<int> answer; | |
build_suffix_array2(str, answer); | |
if (expected != answer) { | |
printf("FAILED!! t=%d, n=%d\n", t, n); | |
ok = false; | |
} | |
} | |
if (ok) { puts("OK!!"); } | |
} | |
void test_build_lcp() { | |
printf("testing build_lcp()..."); | |
fflush(stdout); | |
bool ok = true; | |
char str[1024]; | |
for (int t = 0; t < 10000; ++t) { | |
int n = rand() % 100; | |
int p = rand() % 254 + 1; | |
for (int i = 0; i < n; ++i) { | |
str[i] = rand() % p + 1; | |
} | |
str[n] = 0; | |
vector<int> sa; | |
build_suffix_array2(str, sa); | |
vector<int> expected; | |
build_lcp1(str, sa, expected); | |
vector<int> answer; | |
build_lcp2(str, sa, answer); | |
if (expected != answer) { | |
printf("FAILED!! t=%d, n=%d, p=%d\n", t, n, p); | |
ok = false; | |
} | |
} | |
for (int t = 0; t < 1000; ++t) { | |
int n = rand() % 100, c = rand() % 254 + 1; | |
for (int i = 0; i < n; ++i) { str[i] = c; } | |
str[n] = 0; | |
vector<int> sa; | |
build_suffix_array2(str, sa); | |
vector<int> expected; | |
build_lcp1(str, sa, expected); | |
vector<int> answer; | |
build_lcp2(str, sa, answer); | |
if (expected != answer) { | |
printf("FAILED!! t=%d, n=%d\n", t, n); | |
ok = false; | |
} | |
} | |
if (ok) { puts("OK!!"); } | |
} | |
const int N = 1000; | |
char str[N+1]; | |
void test_search() { | |
printf("testing search()..."); | |
fflush(stdout); | |
bool ok = true; | |
for (int T = 0; T < 300; ++T) { | |
for (int i = 0; i < N; ++i) { str[i] = rand() % 8 + 'a'; } | |
str[N] = 0; | |
vector<int> sa, height, rmq; | |
build_suffix_array2(str, sa); | |
build_lcp2(str, sa, height); | |
build_rmq(height, rmq); | |
for (int t = 0; t < 10000; ++t) { | |
char w[256]; | |
int n = rand() % 10 + 1; | |
for (int i = 0; i < n; ++i) { w[i] = rand() % 8 + 'a'; } | |
w[n] = 0; | |
bool expected = search1(w, str, sa); | |
bool answer = search2(w, str, sa, rmq); | |
if (expected != answer) { | |
printf("FAILED!! T=%d, t=%d, n=%d, expected=%d, answer=%d\n", T, t, n, (int)expected, (int)answer); | |
ok = false; | |
} | |
} | |
} | |
for (int T = 0; T < 8; ++T) { | |
int c = 'a'+T; | |
for (int i = 0; i < N; ++i) { str[i] = c; } | |
str[N] = 0; | |
vector<int> sa, height, rmq; | |
build_suffix_array2(str, sa); | |
build_lcp2(str, sa, height); | |
build_rmq(height, rmq); | |
for (int t = 0; t < 10000; ++t) { | |
char w[256]; | |
int n = rand() % 10 + 1; | |
for (int i = 0; i < n; ++i) { w[i] = rand() % 8 + 'a'; } | |
w[n] = 0; | |
bool expected = search1(w, str, sa); | |
bool answer = search2(w, str, sa, rmq); | |
if (expected != answer) { | |
printf("FAILED!! T=%d, t=%d, n=%d, expected=%d, answer=%d\n", T, t, n, (int)expected, (int)answer); | |
ok = false; | |
} | |
} | |
} | |
if (ok) { puts("OK!!"); } | |
} | |
int main() { | |
srand(12345); | |
test_build_suffix_array(); | |
test_build_lcp(); | |
test_search(); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment