Last active
June 29, 2017 11:13
-
-
Save dipu-bd/1df39b57b3b23957ba3abd432d97c36a to your computer and use it in GitHub Desktop.
Finding Longest Common Prefix (LCP) using Suffix Array. Complexity: SA=O(n.log(n)), LCP=O(n)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*==================================================================== | |
Title: Compute LCP from Suffix Array using Radix Sort | |
Suffix Array: O(n.log(n)) | |
Compute LCP: O(n) | |
Author : Sudipto Chandra (Dipu) | |
=====================================================================*/ | |
#include <bits/stdc++.h> | |
using namespace std; | |
#define mem(a,b) memset(a, b, sizeof(a)) | |
#define loop(i, x) for(__typeof((x).begin()) i=(x).begin(); i!=(x).end(); ++i) | |
#define rloop(i, x) for(__typeof((x).rbegin()) i=(x).rbegin(); i!=(x).rend(); ++i) | |
/*------------------------------------------------------------------------------------*/ | |
int test, cas = 1; | |
const int SIZ = 10000050; // maximum possible size | |
int n; // text length | |
char T[SIZ]; // text | |
int SA[SIZ], tempSA[SIZ]; // the sorted suffixes | |
int RA[SIZ], tempRA[SIZ]; // ranks of suffix array | |
int L[SIZ]; // used in counting sort | |
int Phi[SIZ]; // the jumping parameter | |
int LCP[SIZ]; // longest common prefix | |
int PLCP[SIZ]; // permuted longest common prefix | |
inline int getRA(int i) | |
{ | |
return (i < n) ? RA[i] : 0; | |
} | |
void radix_sort(int k) | |
{ | |
mem(L, 0); | |
// count frequencies | |
for(int i = 0; i < n; ++i) | |
{ | |
L[getRA(i + k)]++; | |
} | |
// save first index of every characters | |
int mx = max(n, 130); | |
for(int i = 0, s = 0; i < mx; ++i) | |
{ | |
int x = L[i]; | |
L[i] = s; | |
s += x; | |
} | |
// build sorted tempSA | |
for(int i = 0; i < n; ++i) | |
{ | |
int& x = L[getRA(SA[i] + k)]; | |
tempSA[x++] = SA[i]; | |
} | |
// copy tempSA to SA | |
for(int i = 0; i < n; ++i) | |
{ | |
SA[i] = tempSA[i]; | |
} | |
} | |
// text must ends with a $ | |
void buildSA() | |
{ | |
// initialize | |
n = strlen(T); | |
T[n++] = '$', T[n] = 0; // append $ | |
for(int i = 0; i < n; ++i) | |
{ | |
SA[i] = i; | |
RA[i] = T[i]; | |
} | |
// algorithm loop | |
for(int k = 1; k < n; k <<= 1) | |
{ | |
// sort by k-th ranks | |
radix_sort(k); | |
radix_sort(0); | |
// compute new ranks | |
tempRA[SA[0]] = 0; | |
for(int i = 1, r = 0; i < n; ++i) | |
{ | |
if(getRA(SA[i-1]) != getRA(SA[i])) { | |
r++; | |
} | |
else if(getRA(SA[i-1]+k) != getRA(SA[i]+k)) { | |
r++; | |
} | |
tempRA[SA[i]] = r; | |
} | |
for(int i = 0; i < n; ++i) | |
{ | |
RA[i] = tempRA[i]; | |
} | |
if(RA[SA[n - 1]] == n - 1) break; | |
} | |
} | |
// Finds the Longest Common Prefix (LCP) between two adjacent suffixes | |
// excluding the first suffix using the Permuted LCP (PLCP) theorem. | |
void computeLCP() | |
{ | |
Phi[SA[0]] = -1; | |
for(int i = 1; i < n; ++i) | |
{ | |
Phi[SA[i]] = SA[i - 1]; | |
} | |
for(int i = 0, L = 0; i < n; ++i) | |
{ | |
if(Phi[i] == -1) | |
{ | |
PLCP[i] = 0; | |
continue; | |
} | |
// longest common prefix length | |
L = max(L - 1, 0); | |
while(T[i + L] == T[Phi[i] + L]) | |
{ | |
L++; | |
} | |
PLCP[i] = L; | |
} | |
for(int i = 0; i < n; ++i) | |
{ | |
LCP[i] = PLCP[SA[i]]; | |
} | |
} | |
void TEST() | |
{ | |
int values[] = { | |
10, | |
100, | |
1000, | |
10000, | |
50000, | |
100000, | |
500000, | |
1000000, | |
2000000, | |
3000000, | |
4000000, | |
5000000, | |
/*6000000, | |
7000000, | |
8000000*/ | |
}; | |
int siz = sizeof(values) / sizeof(int); | |
double avg_cpi = 0; | |
puts(""); | |
puts("| n | Runtime(s) | TPI(ms) |"); | |
puts("|----------:|:----------:|:------------:|"); | |
for(int k = 0; k < siz; ++k) | |
{ | |
int n = values[k]; | |
for(int i = 0; i < n; ++i) | |
{ | |
if(rand() & 1) | |
{ | |
T[i] = 'A' + (rand() % 26); | |
} | |
else if(rand() & 1) | |
{ | |
T[i] = 'a' + (rand() % 26); | |
} | |
else | |
{ | |
T[i] = '0' + (rand() % 10); | |
} | |
} | |
T[n] = 0; | |
time_t start = clock(); | |
buildSA(); // builds the suffix array | |
computeLCP(); // calculate longest common prefix | |
time_t stop = clock(); | |
double time = (double)(stop - start) / CLOCKS_PER_SEC; | |
double cpi = (double)(stop - start) / (n * log2(n)); | |
printf("| `%7d` | `%5.3f` | `%0.8f` |\n", n, time, cpi); | |
if(k) avg_cpi += (values[k] - values[k - 1]) * cpi; | |
else avg_cpi += values[k] * cpi; | |
} | |
avg_cpi /= values[siz - 1]; | |
printf("\n**Average *Time Per Instructions*** = `%.10f ms`\n", avg_cpi); | |
} | |
void RUN() | |
{ | |
/* | |
GATAGACA | |
abcabxabcd | |
*/ | |
gets(T); | |
buildSA(); | |
computeLCP(); | |
puts("============================================="); | |
printf("| i | SA | Phi | PLCP | suffix | LCP |\n"); | |
printf("|----|----|-----|------|--------------|-----|\n"); | |
for(int i = 0; i < n; ++i) | |
{ | |
printf("| %2d | %2d | %3d | %2d | %-12s | %3d |\n", i, SA[i], Phi[i], PLCP[i], T + i, LCP[i]); | |
} | |
puts("============================================="); | |
} | |
int main() | |
{ | |
//RUN(); | |
TEST(); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Performance measures
1. Core i7-4790U @3.6GHz + 8GB RAM
10
0.015
0.45154499
100
0.025
0.03762875
1000
0.030
0.00301030
10000
0.036
0.00027093
50000
0.040
0.00005125
100000
0.045
0.00002709
500000
0.130
0.00001373
1000000
0.420
0.00002107
2000000
1.105
0.00002640
3000000
2.946
0.00004564
4000000
4.218
0.00004808
5000000
5.622
0.00005053
Average Time Per Instructions =
0.0000406254 ms