Last active
February 22, 2020 13:48
-
-
Save dipu-bd/bebb1aae8a87d64d60cb338600f4fec3 to your computer and use it in GitHub Desktop.
Suffix array implementation using Radix Sort. Complexity: O(n.log(n))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*================================== | |
Title: Suffix Array using Radix Sort | |
Complexity: O(n.log(n)) | |
Author : Sudipto Chandra (Dipu) | |
===================================*/ | |
#include <bits/stdc++.h> | |
using namespace std; | |
#define mem(a,b) memset(a, b, sizeof(a)) | |
#define loop(i, x) for(__typeof((x).begin()) i=(x).begin(); i!=(x).end(); ++i) | |
#define rloop(i, x) for(__typeof((x).rbegin()) i=(x).rbegin(); i!=(x).rend(); ++i) | |
/*------------------------------------------------------------------------------------*/ | |
void print(int k, const char* title); | |
bool test = false; | |
const int SIZ = 10000050; // maximum possible size | |
int n; // text length | |
char T[SIZ]; // text string | |
int SA[SIZ], tempSA[SIZ]; // the sorted suffixes | |
int RA[SIZ], tempRA[SIZ]; // ranks of suffix array | |
int L[SIZ]; // used in counting sort | |
inline int getRA(int i) | |
{ | |
return (i < n) ? RA[i] : 0; | |
} | |
void radix_sort(int k) | |
{ | |
mem(L, 0); | |
// count frequencies | |
for(int i = 0; i < n; ++i) | |
{ | |
L[getRA(i + k)]++; | |
} | |
// save first index of every characters | |
int mx = max(n, 130); | |
for(int i = 0, s = 0; i < mx; ++i) | |
{ | |
int x = L[i]; | |
L[i] = s; | |
s += x; | |
} | |
// build sorted tempSA | |
for(int i = 0; i < n; ++i) | |
{ | |
int& x = L[getRA(SA[i] + k)]; | |
tempSA[x++] = SA[i]; | |
} | |
// copy tempSA to SA | |
for(int i = 0; i < n; ++i) | |
{ | |
SA[i] = tempSA[i]; | |
} | |
} | |
// text must ends with a $ | |
void buildSA() | |
{ | |
// initialize | |
n = strlen(T); | |
T[n++] = '$', T[n] = 0; // append $ | |
for(int i = 0; i < n; ++i) | |
{ | |
SA[i] = i; | |
RA[i] = T[i]; | |
} | |
if(!test) print(1, "Initialized:"); | |
// algorithm loop | |
for(int k = 1; k < n; k <<= 1) | |
{ | |
// sort by k-th ranks | |
radix_sort(k); | |
radix_sort(0); | |
if(!test) print(k, "After sorting:"); | |
// compute new ranks | |
tempRA[SA[0]] = 0; | |
for(int i = 1, r = 0; i < n; ++i) | |
{ | |
if(getRA(SA[i-1]) != getRA(SA[i])) { | |
r++; | |
} | |
else if(getRA(SA[i-1]+k) != getRA(SA[i]+k)) { | |
r++; | |
} | |
tempRA[SA[i]] = r; | |
} | |
for(int i = 0; i < n; ++i) | |
{ | |
RA[i] = tempRA[i]; | |
} | |
if(!test) print(k, "New ranks:"); | |
if(RA[SA[n - 1]] == n - 1) break; | |
} | |
} | |
void print(int k, const char* title = "") | |
{ | |
if(title[0]) printf("%s\n", title); | |
puts("========================================================"); | |
printf("| i | SA[i] | RA[SA[i]] | RA[SA[i] +%2d] | tempRA[i] |\n", k); | |
printf("|----|-------|-----------|---------------|------------|\n"); | |
for(int i = 0; i < n; ++i) | |
{ | |
printf("| %2d | ", i); | |
printf(" %3d | ", SA[i]); | |
printf(" %5d | ", getRA(SA[i])); | |
printf(" %7d | ", getRA(SA[i]+k)); | |
printf(" %4d |\n", tempRA[SA[i]]); | |
} | |
puts("========================================================"); | |
cin.get(); | |
} | |
void RUN() | |
{ | |
printf("Text: "); | |
gets(T); | |
buildSA(); | |
} | |
void TEST() | |
{ | |
test = true; | |
int values[] = { | |
10, | |
100, | |
1000, | |
10000, | |
50000, | |
100000, | |
500000, | |
1000000, | |
2000000, | |
3000000, | |
4000000, | |
5000000, | |
6000000, | |
7000000, | |
8000000 | |
}; | |
int siz = sizeof(values) / sizeof(int); | |
double avg_cpi = 0; | |
puts(""); | |
puts("| n | Runtime(s) | TPI(ms) |"); | |
puts("|----------:|:----------:|:------------:|"); | |
for(int k = 0; k < siz; ++k) | |
{ | |
int n = values[k]; | |
for(int i = 0; i < n; ++i) | |
{ | |
if(rand() & 1) | |
{ | |
T[i] = 'A' + (rand() % 26); | |
} | |
else if(rand() & 1) | |
{ | |
T[i] = 'a' + (rand() % 26); | |
} | |
else | |
{ | |
T[i] = '0' + (rand() % 10); | |
} | |
} | |
T[n] = 0; | |
time_t start = clock(); | |
buildSA(); // builds the suffix array | |
time_t stop = clock(); | |
double time = (double)(stop - start) / CLOCKS_PER_SEC; | |
double cpi = (double)(stop - start) / (n * log2(n)); | |
printf("| `%7d` | `%5.3f` | `%0.8f` |\n", n, time, cpi); | |
if(k) avg_cpi += (values[k] - values[k - 1]) * cpi; | |
else avg_cpi += values[k] * cpi; | |
} | |
avg_cpi /= values[siz - 1]; | |
printf("\n"); | |
printf("**Average *Time Per Instructions*** = `%.10f ms`\n", avg_cpi); | |
} | |
int main() | |
{ | |
//RUN(); | |
TEST(); | |
return 0; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Performance measures
1. Core i7-4790U @3.6GHz + 8GB RAM @1600MHz
10
0.020
0.60205999
100
0.024
0.03612360
1000
0.025
0.00250858
10000
0.030
0.00022577
50000
0.038
0.00004869
100000
0.045
0.00002709
500000
0.114
0.00001204
1000000
0.377
0.00001891
2000000
1.031
0.00002463
3000000
2.816
0.00004363
4000000
4.012
0.00004573
5000000
5.333
0.00004793
6000000
6.813
0.00005043
7000000
7.802
0.00004902
8000000
9.172
0.00005000
Average Time Per Instructions =
0.0000428121 ms
2 Core i5-3210M @2.5GHz + 4GB RAM @1600MHz
10
0.033
0.99339899
100
0.037
0.05569055
1000
0.031
0.00311064
10000
0.048
0.00036124
50000
0.050
0.00006406
100000
0.069
0.00004154
500000
0.298
0.00003148
1000000
0.773
0.00003878
2000000
1.701
0.00004063
3000000
3.851
0.00005966
4000000
5.115
0.00005831
5000000
6.559
0.00005895
6000000
8.224
0.00006087
7000000
9.457
0.00005941
8000000
11.006
0.00005999
Average Time Per Instructions =
0.0000569310 ms
3. Core i3-4005U @1.7GHz + 4GB RAM @1333Mhz
10
0.035
1.05360498
100
0.046
0.06923690
1000
0.047
0.00471614
10000
0.061
0.00045907
50000
0.087
0.00011147
100000
0.081
0.00004877
500000
0.306
0.00003233
1000000
0.966
0.00004847
2000000
2.202
0.00005260
3000000
5.420
0.00008397
4000000
6.988
0.00007966
5000000
8.622
0.00007749
6000000
11.067
0.00008192
7000000
12.607
0.00007920
8000000
13.722
0.00007480
Average Time Per Instructions =
0.0000748545 ms