Created
June 29, 2017 02:00
-
-
Save dipu-bd/13af7e0edb1d0cbc5d9ec56e07fb084c to your computer and use it in GitHub Desktop.
Search patterns within text using Suffix Array by utilizing C++ library functions
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*==================================================================== | |
Title: Computes Longest Repeated Substring from a given string | |
Complexity: O(n.log(n)) | |
Author : Sudipto Chandra (Dipu) | |
=====================================================================*/ | |
#include <bits/stdc++.h> | |
using namespace std; | |
#define mem(a,b) memset(a, b, sizeof(a)) | |
#define loop(i, x) for(__typeof((x).begin()) i=(x).begin(); i!=(x).end(); ++i) | |
#define rloop(i, x) for(__typeof((x).rbegin()) i=(x).rbegin(); i!=(x).rend(); ++i) | |
/*------------------------------------------------------------------------------------*/ | |
int test, cas = 1; | |
const int SIZ = 10000050; // maximum possible size | |
int n; // text length | |
int m; // pattern length | |
char T[SIZ]; // text | |
char P[SIZ]; // pattern | |
int SA[SIZ], tempSA[SIZ]; // the sorted suffixes | |
int RA[SIZ], tempRA[SIZ]; // ranks of suffix array | |
int L[SIZ]; // used in counting sort | |
inline int getRA(int i) | |
{ | |
return (i < n) ? RA[i] : 0; | |
} | |
void radix_sort(int k) | |
{ | |
mem(L, 0); | |
// count frequencies | |
for(int i = 0; i < n; ++i) | |
{ | |
L[getRA(i + k)]++; | |
} | |
// save first index of every characters | |
int mx = max(n, 130); | |
for(int i = 0, s = 0; i < mx; ++i) | |
{ | |
int x = L[i]; | |
L[i] = s; | |
s += x; | |
} | |
// build sorted tempSA | |
for(int i = 0; i < n; ++i) | |
{ | |
int& x = L[getRA(SA[i] + k)]; | |
tempSA[x++] = SA[i]; | |
} | |
// copy tempSA to SA | |
for(int i = 0; i < n; ++i) | |
{ | |
SA[i] = tempSA[i]; | |
} | |
} | |
// text must ends with a $ | |
void buildSA() | |
{ | |
// initialize | |
n = strlen(T); | |
T[n++] = '$', T[n] = 0; // append $ | |
for(int i = 0; i < n; ++i) | |
{ | |
SA[i] = i; | |
RA[i] = T[i]; | |
} | |
// algorithm loop | |
for(int k = 1; k < n; k <<= 1) | |
{ | |
// sort by k-th ranks | |
radix_sort(k); | |
radix_sort(0); | |
// compute new ranks | |
tempRA[SA[0]] = 0; | |
for(int i = 1, r = 0; i < n; ++i) | |
{ | |
if(getRA(SA[i-1]) != getRA(SA[i])) { | |
r++; | |
} | |
else if(getRA(SA[i-1]+k) != getRA(SA[i]+k)) { | |
r++; | |
} | |
tempRA[SA[i]] = r; | |
} | |
for(int i = 0; i < n; ++i) | |
{ | |
RA[i] = tempRA[i]; | |
} | |
if(RA[SA[n - 1]] == n - 1) break; | |
} | |
} | |
// functions used to count occurrences of pattern | |
bool isLesser(int t, const char* p) | |
{ | |
return strncmp(T + t, p, m) < 0; | |
} | |
bool isGreater(const char* p, int t) | |
{ | |
return strncmp(T + t, p, m) > 0; | |
} | |
// Count the number of times a given pattern P appears | |
// in the string T. Complexity: O(n.log(n)) | |
int countOccurrences() | |
{ | |
m = strlen(P); | |
int first = lower_bound(SA, SA + n, P, isLesser) - SA; | |
int last = upper_bound(SA, SA + n, P, isGreater) - SA; | |
printf("%d %d\n", first, last); | |
return last - first; | |
} | |
int main() | |
{ | |
/* | |
GATAGACA | |
GA | |
abcabxabcd | |
abc | |
mississippi | |
i | |
*/ | |
printf("Text: "); | |
scanf("%s", T); | |
printf("Pattern: "); | |
scanf("%s", P); | |
buildSA(); | |
for(int i = 0; i < n; ++i) | |
{ | |
printf("%2d | %2d | %s\n", i, SA[i], T + SA[i]); | |
} | |
int ans = countOccurrences(); | |
printf("Pattern found %d times\n", ans); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment