Created
March 21, 2013 04:36
-
-
Save msg555/5210712 to your computer and use it in GitHub Desktop.
Suffix Array Implementation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
struct suffix_array { | |
suffix_array(const char* S) : N(strlen(S)) { | |
vector<int> V; | |
for(int i = 0; i < N; i++) V.push_back(S[i]); | |
init(V); | |
} | |
suffix_array(const vector<int>& VV) : N(VV.size()) { | |
vector<int> V(VV); | |
init(V); | |
} | |
/* Get longest common prefix between two suffix-indicies. */ | |
int lcp(int si, int sj) { | |
if(sj < si) swap(si, sj); | |
if(si == sj) return N - SA[si]; | |
int len = sj - si; | |
int buck = 31 - __builtin_clz(len); | |
return min(LCP_MRQ[buck][si], LCP_MRQ[buck][sj - (1 << buck)]); | |
} | |
/* Return <j, l> s.t. s[j] = c and s+j+1 shares the longest prefix with s+i. | |
* Returns <0, 0> if no such index exists. */ | |
pair<int, int> backstep(int i, int c) { | |
typeof(BACK_START.begin()) it = BACK_START.find(c); | |
if(it == BACK_START.end()) return make_pair(0, 0); | |
int pi = it->second; ++it; | |
int pj = it == BACK_START.end() ? N : it->second; | |
int si = RA[i]; | |
int p = lower_bound(BACK_LIST.begin() + pi, BACK_LIST.begin() + pj, si) - | |
BACK_LIST.begin(); | |
pair<int, int> r(-1, -1); | |
for(int k = p - 1; k <= p; k++) { | |
if(k < pi || pj <= k) continue; | |
int sj = BACK_LIST[k]; | |
r = max(r, make_pair(1 + lcp(si, sj), SA[sj] - 1)); | |
} | |
return make_pair(r.second, r.first); | |
} | |
/* Returns the <index, match length> of the suffix with the longest shared | |
* prefix with K. Ties broken arbitrarily. */ | |
template<typename T> | |
pair<int, int> find(const T* K, int KN) { | |
int ind = 0, len = 0; | |
for(int i = KN - 1; i >= 0; i--) { | |
pair<int, int> res = backstep(ind, K[i]); | |
ind = res.first; | |
len = min(len + 1, res.second); | |
} | |
return make_pair(ind, len); | |
} | |
/* Figure out what range of ordered suffixes match the prefix s[i:i+dep). Range | |
* is of [) form in terms of suffix-indicies. */ | |
pair<int, int> range(int i, int dep) { | |
pair<int, int> sr(RA[i], RA[i]); | |
for(int j = 31 - __builtin_clz(N + 1); j >= 0; j--) { | |
int su = sr.first - (1 << j); | |
int sv = sr.second + (1 << j); | |
if(su >= 0 && lcp(su, sr.first ) >= dep) sr.first = su; | |
if(sv <= N && lcp(sv, sr.second) >= dep) sr.second = sv; | |
} | |
++sr.second; | |
return sr; | |
} | |
int N; | |
vector<int> SA; | |
vector<int> RA; | |
private: | |
void compress(vector<int>& V, vector<int>& C) { | |
copy(V.begin(), V.end(), C.begin()); | |
sort(C.begin(), C.end()); | |
typeof(C.begin()) cend = unique(C.begin(), C.end()); | |
for(int i = 0; i < N; i++) { | |
V[i] = lower_bound(C.begin(), cend, V[i]) - C.begin() + 1; | |
} | |
V.push_back(0); C.push_back(0); | |
} | |
void compute_sa(vector<int>& V, vector<int>& C) { | |
vector<int> T(N + 1); | |
for(int i = 0; i <= N; i++) SA.push_back(i); | |
for(int ski = 0; V[SA[N]] < N; ski = ski ? ski << 1 : 1) { | |
fill(C.begin(), C.end(), 0); | |
for(int i = 0; i < ski; i++) T[i] = N - i; | |
for(int i = 0, p = ski; i <= N; i++) if(SA[i] >= ski) T[p++] = SA[i] - ski; | |
for(int i = 0; i <= N; i++) C[V[i]]++; | |
for(int i = 1; i <= N; i++) C[i] += C[i - 1]; | |
for(int i = N; i >= 0; i--) SA[--C[V[T[i]]]] = T[i]; | |
T[SA[0]] = 0; | |
for(int j = 1; j <= N; j++) { | |
int a = SA[j]; | |
int b = SA[j - 1]; | |
T[a] = T[b] + (a + ski >= N || b + ski >= N || | |
V[a] != V[b] || V[a + ski] != V[b + ski]); | |
} | |
V.swap(T); | |
} | |
} | |
void compute_lcp(const vector<int>& OV) { | |
LCP_MRQ.push_back(vector<int>(N)); | |
int len = 0; | |
for(int i = 0; i < N; i++, len = max(0, len - 1)) { | |
int si = RA[i]; | |
int j = SA[si - 1]; | |
for(; i + len < N && j + len < N && OV[i + len] == OV[j + len]; len++); | |
LCP_MRQ[0][si - 1] = len; | |
} | |
for(int i = 1; 1 << i <= N; i++) { | |
LCP_MRQ.push_back(vector<int>()); | |
const vector<int>& plcp = LCP_MRQ[i - 1]; | |
for(int j = 0; j + (1 << i) <= N; j++) { | |
LCP_MRQ[i].push_back(min(plcp[j], plcp[j + (1 << i - 1)])); | |
} | |
} | |
} | |
void compute_backsteps(const vector<int>& OV) { | |
int add = 0; | |
for(int i = 0; i < OV.size(); i++) ++BACK_START[OV[i]]; | |
for(typeof(BACK_START.begin()) it = BACK_START.begin(); | |
it != BACK_START.end(); it++) { | |
add += it->second; | |
it->second = add; | |
} | |
BACK_LIST.resize(N); | |
for(int si = N; si >= 0; si--) { | |
int i = SA[si] - 1; | |
if(i >= 0) BACK_LIST[--BACK_START[OV[i]]] = si; | |
} | |
} | |
void init(vector<int>& V) { | |
vector<int> OV(V); | |
vector<int> C(N); | |
compress(V, C); | |
compute_sa(V, C); | |
RA.resize(N + 1); | |
for(int i = 0; i <= N; i++) RA[SA[i]] = i; | |
compute_lcp(OV); | |
compute_backsteps(OV); | |
} | |
vector<vector<int> > LCP_MRQ; | |
map<int, int> BACK_START; | |
vector<int> BACK_LIST; | |
}; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment