Skip to content

Instantly share code, notes, and snippets.

@dipu-bd
Last active June 29, 2017 11:13
Show Gist options
  • Save dipu-bd/1df39b57b3b23957ba3abd432d97c36a to your computer and use it in GitHub Desktop.
Save dipu-bd/1df39b57b3b23957ba3abd432d97c36a to your computer and use it in GitHub Desktop.
Finding Longest Common Prefix (LCP) using Suffix Array. Complexity: SA=O(n.log(n)), LCP=O(n)
/*====================================================================
Title: Compute LCP from Suffix Array using Radix Sort
Suffix Array: O(n.log(n))
Compute LCP: O(n)
Author : Sudipto Chandra (Dipu)
=====================================================================*/
#include <bits/stdc++.h>
using namespace std;
#define mem(a,b) memset(a, b, sizeof(a))
#define loop(i, x) for(__typeof((x).begin()) i=(x).begin(); i!=(x).end(); ++i)
#define rloop(i, x) for(__typeof((x).rbegin()) i=(x).rbegin(); i!=(x).rend(); ++i)
/*------------------------------------------------------------------------------------*/
int test, cas = 1;
const int SIZ = 10000050; // maximum possible size
int n; // text length
char T[SIZ]; // text
int SA[SIZ], tempSA[SIZ]; // the sorted suffixes
int RA[SIZ], tempRA[SIZ]; // ranks of suffix array
int L[SIZ]; // used in counting sort
int Phi[SIZ]; // the jumping parameter
int LCP[SIZ]; // longest common prefix
int PLCP[SIZ]; // permuted longest common prefix
inline int getRA(int i)
{
return (i < n) ? RA[i] : 0;
}
void radix_sort(int k)
{
mem(L, 0);
// count frequencies
for(int i = 0; i < n; ++i)
{
L[getRA(i + k)]++;
}
// save first index of every characters
int mx = max(n, 130);
for(int i = 0, s = 0; i < mx; ++i)
{
int x = L[i];
L[i] = s;
s += x;
}
// build sorted tempSA
for(int i = 0; i < n; ++i)
{
int& x = L[getRA(SA[i] + k)];
tempSA[x++] = SA[i];
}
// copy tempSA to SA
for(int i = 0; i < n; ++i)
{
SA[i] = tempSA[i];
}
}
// text must ends with a $
void buildSA()
{
// initialize
n = strlen(T);
T[n++] = '$', T[n] = 0; // append $
for(int i = 0; i < n; ++i)
{
SA[i] = i;
RA[i] = T[i];
}
// algorithm loop
for(int k = 1; k < n; k <<= 1)
{
// sort by k-th ranks
radix_sort(k);
radix_sort(0);
// compute new ranks
tempRA[SA[0]] = 0;
for(int i = 1, r = 0; i < n; ++i)
{
if(getRA(SA[i-1]) != getRA(SA[i])) {
r++;
}
else if(getRA(SA[i-1]+k) != getRA(SA[i]+k)) {
r++;
}
tempRA[SA[i]] = r;
}
for(int i = 0; i < n; ++i)
{
RA[i] = tempRA[i];
}
if(RA[SA[n - 1]] == n - 1) break;
}
}
// Finds the Longest Common Prefix (LCP) between two adjacent suffixes
// excluding the first suffix using the Permuted LCP (PLCP) theorem.
void computeLCP()
{
Phi[SA[0]] = -1;
for(int i = 1; i < n; ++i)
{
Phi[SA[i]] = SA[i - 1];
}
for(int i = 0, L = 0; i < n; ++i)
{
if(Phi[i] == -1)
{
PLCP[i] = 0;
continue;
}
// longest common prefix length
L = max(L - 1, 0);
while(T[i + L] == T[Phi[i] + L])
{
L++;
}
PLCP[i] = L;
}
for(int i = 0; i < n; ++i)
{
LCP[i] = PLCP[SA[i]];
}
}
void TEST()
{
int values[] = {
10,
100,
1000,
10000,
50000,
100000,
500000,
1000000,
2000000,
3000000,
4000000,
5000000,
/*6000000,
7000000,
8000000*/
};
int siz = sizeof(values) / sizeof(int);
double avg_cpi = 0;
puts("");
puts("| n | Runtime(s) | TPI(ms) |");
puts("|----------:|:----------:|:------------:|");
for(int k = 0; k < siz; ++k)
{
int n = values[k];
for(int i = 0; i < n; ++i)
{
if(rand() & 1)
{
T[i] = 'A' + (rand() % 26);
}
else if(rand() & 1)
{
T[i] = 'a' + (rand() % 26);
}
else
{
T[i] = '0' + (rand() % 10);
}
}
T[n] = 0;
time_t start = clock();
buildSA(); // builds the suffix array
computeLCP(); // calculate longest common prefix
time_t stop = clock();
double time = (double)(stop - start) / CLOCKS_PER_SEC;
double cpi = (double)(stop - start) / (n * log2(n));
printf("| `%7d` | `%5.3f` | `%0.8f` |\n", n, time, cpi);
if(k) avg_cpi += (values[k] - values[k - 1]) * cpi;
else avg_cpi += values[k] * cpi;
}
avg_cpi /= values[siz - 1];
printf("\n**Average *Time Per Instructions*** = `%.10f ms`\n", avg_cpi);
}
void RUN()
{
/*
GATAGACA
abcabxabcd
*/
gets(T);
buildSA();
computeLCP();
puts("=============================================");
printf("| i | SA | Phi | PLCP | suffix | LCP |\n");
printf("|----|----|-----|------|--------------|-----|\n");
for(int i = 0; i < n; ++i)
{
printf("| %2d | %2d | %3d | %2d | %-12s | %3d |\n", i, SA[i], Phi[i], PLCP[i], T + i, LCP[i]);
}
puts("=============================================");
}
int main()
{
//RUN();
TEST();
return 0;
}
@dipu-bd
Copy link
Author

dipu-bd commented Jun 28, 2017

Performance measures

1. Core i7-4790U @3.6GHz + 8GB RAM

n Runtime(s) TPI(ms)
10 0.015 0.45154499
100 0.025 0.03762875
1000 0.030 0.00301030
10000 0.036 0.00027093
50000 0.040 0.00005125
100000 0.045 0.00002709
500000 0.130 0.00001373
1000000 0.420 0.00002107
2000000 1.105 0.00002640
3000000 2.946 0.00004564
4000000 4.218 0.00004808
5000000 5.622 0.00005053

Average Time Per Instructions = 0.0000406254 ms

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment