Last active
August 31, 2016 09:29
-
-
Save htfy96/7907c056968258a3cece0b55df12fbb6 to your computer and use it in GitHub Desktop.
suffix array with comment
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <algorithm> | |
#include <string> | |
#include <vector> | |
#include <climits> | |
using namespace std; | |
/* | |
* Suffix array is a useful data structure which stores | |
* suffix of a string in alphabetical order. | |
* (https://en.wikipedia.org/wiki/Suffix_array) | |
* | |
* For example: | |
* raw = abaab | |
* Suffix: abaab, baab, aab, ab, b | |
* Sorted suffix: aab, ab, abaab, b, baab . | |
* Null character is considered less than any other character. | |
* | |
* Suffix array of raw: | |
* [2, 3, 0, 4, 1] | |
* | |
* because suffix starting from position 2(aab) is the smallest, then | |
* comes suffix starting from position 3(ab), ..., and suffix starting | |
* from position 1 (baab) is the largest | |
*/ | |
/* | |
* This algorithm builds suffix array in O(nlogn) time complexity | |
* and O(CHAR_MAX + n) space complexity, based on sorting segments | |
* with length 1, 2, 4, 8, ..., until all segments are different, | |
* which is called "doubling algorithm" | |
* | |
* ====================================== | |
* | |
* Still take "abaab" as an example: | |
* | |
* Iteration 1 | |
* -------------------- | |
* We sort all segments with length 1 in 'raw'(i.e. a b a a b), and relabel the original string | |
* x = 01001 // 'a' => 0, 'b' => 1. two different segments. still have repeated segments, continue iteration | |
* sa = 02314 // because letter 'a' appears at 0, 2, 3, then letter 'b' at 1, 4 | |
* Here, x[i] stands for the order of raw[i] | |
* | |
* Iteration 2 [Step = 1] | |
* -------------------- | |
* We sort all segments with length 2 in 'raw'(i.e. 01 10 00 01 1$), and relabel it | |
* x = 13012 // '00' => 0, 01 => 1, 1$ => 2, 10 => 3. 4 different segments. still have repeated segments, continue iteration. | |
* sa = 20341 // 00 at position 2, 01 at position 0 and 3, 1$ at position 4, 10 at position 1 | |
* | |
* Here, x[i] stands for the order of raw[i...i+1] | |
* | |
* Iteration 3 [Step = 2] | |
* -------------------- | |
* We sort all segments with length 4 in 'x'(i.e. 10 31 02 1$ 2$, | |
* because if we want x[i] stands for raw[i...i+3], then we need to let the concatenation of | |
* old_x[i](which stands for raw[i...i+1]) and old_x[i+2](which stands for raw[i+2...i+3]) become the | |
* key of length-4 segment) | |
* | |
* x = 24013 // 02 => 0, 1$ => 1, 10 => 2, 2$ => 3, 31 => 4. All different, done! | |
* sa = 23041 // 02 at position 2, 1$ at position 3, 10 at position 0, 2$ at position 4, 31 at position 1 | |
* | |
* Here, x[i] stands for the order of raw[i...i+3] of all length-4 substrings of raw | |
* | |
* ========================================== | |
* In each iteration, we need sort key pairs (x[i], x[i + step]) (0 <= i < n) | |
* The naive method is to use quicksort, thus we need to use O(nlogn) for each iteration, | |
* and O(nlognlogn) for the whole algorithm. | |
* | |
* A better way is to look into the range of x[i]/raw[i]: the maximum is bounded by | |
* both the length of raw (n) [since our goal is to make each elements of x different] and | |
* the character set size CHAR_MAX [for raw[i]]. Therefore, we can use bucket sort. | |
* | |
* For 2-keyword bucket sort, the basic idea is: | |
* 1) sort elements based on the 2nd keyword | |
* 2) sort elements based on the 1st keyword (must be stable) | |
* | |
* ========================================== | |
* How can we perform a stable bukkit sort? | |
* | |
* // Assume origin is [1, 2, 2, 0, 0] | |
* for (int item : origin) | |
* bukkit[item]++; | |
* // bukkit: [2, 1, 2] | |
* for (size_t i=1; i<bukkit_size; ++i) | |
* bukkit[i] += bukkit[i-1]; | |
* // bukkit: [2, 3, 5] | |
* for (int i = origin.size() - 1; i >= 0; --i) | |
* result[ --bukkit[ origin[i] ] ] = i | |
* // result: [3, 4, 0, 1, 2] | |
* ========================================== | |
* | |
* So how can we get the 2nd keyword? Remember that the 2nd keyword for position i is x[i+step] (if i + step < N) | |
* or $ (if i + step >= N). | |
* | |
* Take a look at the result of the 2nd iteration: | |
* x = 13012 | |
* and the key pairs to sort of the 3rd iteration: | |
* 10 31 02 1$ 2$, | |
* 2nd keyword: | |
* 0 1 2 $ $ | |
* | |
* ========================================== | |
* | |
* But the result of sort based on the 2nd keyword is already done! | |
* Sa of iteration 2: | |
* sa = 23041 | |
* | |
* new_sa == { N - step, N - step + 1, ..., N - 1, sa[origin1] - step, sa[origin2] - step, ... } | |
* the smallest "step" elements must be the last "step" elements, because the second keyword of them is always $. | |
* Sort result of 2nd keyword in iteration 3: | |
* new_sa= 3 4 0 1 2. | |
* | |
* step = 2, thus 3 4 are the smallest elements. position 0, 1 are impossible for a second keyword: | |
* ignore them from sa, the remaining items in sa (2 3 4) is the order of 2nd keywords of position (0 1 2) | |
* | |
*/ | |
vector<int> build_sa(const string& raw) | |
{ | |
vector<int> pool(max(size_t(CHAR_MAX), raw.size())); | |
vector<int> sa(raw.size()); | |
string x(raw.size() * 2 , '\0'); // trick to avoid x[i + step] outbounds | |
// The next 6 lines are stable bukkit sort(keyword=raw[i], result: sa and x) | |
for (size_t i=0; i<raw.size(); ++i) | |
++pool[x[i]=raw[i]]; | |
for (size_t i=1; i< pool.size(); ++i) | |
pool[i] += pool[i-1]; | |
for (int i=raw.size() - 1; i>=0; --i) | |
sa[--pool[raw[i]]] = i; | |
for (int step = 1; step < raw.size(); step *=2) | |
{ | |
cout << "Step="<<step << " "; | |
cout << endl; | |
for (int i=0; i<raw.size(); ++i) | |
cout << sa[i] << " "; | |
vector<int> new_sa; | |
// the 2nd keyword of position N - step, ... must be smallest | |
for (size_t i=raw.size() - step; i < raw.size(); ++i) | |
new_sa.push_back(i); | |
// Then, for each elements that could be 2nd keyword, keep its original order with position sa[i] - step | |
for (size_t i=0; i< raw.size(); ++i) | |
if (sa[i] >= step) | |
new_sa.push_back(sa[i] - step); | |
for (int i=0; i<raw.size(); ++i) | |
cout << new_sa[i] << " "; | |
cout << endl; | |
fill(pool.begin(), pool.end(), 0); | |
// The next 9 lines perform stable bukkit sort(keyword: x[new_sa[i]] [the first keyword], result: sa) | |
for (size_t i=0; i<new_sa.size(); ++i) | |
{ | |
cout << "Adding to pool " << x[new_sa[i]] << endl; | |
pool[x[new_sa[i]]]++; | |
} | |
for (size_t i=1; i<pool.size(); ++i) | |
pool[i] += pool[i-1]; | |
for (int i=new_sa.size() - 1; i>=0; --i) | |
sa[--pool[x[new_sa[i]]]] = new_sa[i]; | |
for (int i=0; i<new_sa.size(); ++i) | |
cout << sa[i] << " "; | |
cout << endl; | |
// To rebuild x from sa | |
// Need decide the number based on whether two key pairs are identical, | |
// Two identical key pairs (10, 10) should be assigned with the same number in next iteration | |
string newx = x; | |
int p=0; | |
newx[sa[0]] = 0; | |
for (int i=1; i<sa.size(); ++i) | |
newx[sa[i]] = x[sa[i]] == x[sa[i-1]] && x[sa[i] + step] == x[sa[i-1]+step] ? p : ++p; | |
x = newx; | |
for (size_t i=0; i<raw.size(); ++i) | |
cout << x[i] << endl; | |
cout << endl; | |
// All different, go away | |
if (p == raw.size() - 1) break; | |
} | |
return sa; | |
} | |
int main() | |
{ | |
string s; | |
cin >> s; | |
vector<int> sa = build_sa(s); | |
for (size_t i=0; i<sa.size(); ++i) | |
cout << sa[i] << " "; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment