Last active
June 29, 2017 01:55
-
-
Save dipu-bd/1c01fa6fb4a8f461565eb6c7436af98f to your computer and use it in GitHub Desktop.
Count occurrences of a sub-string within the given text using Suffix Array
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*==================================================================== | |
Title: Computes Longest Repeated Substring from a given string | |
Complexity: O(n.log(n)) | |
Author : Sudipto Chandra (Dipu) | |
=====================================================================*/ | |
#include <bits/stdc++.h> | |
using namespace std; | |
#define mem(a,b) memset(a, b, sizeof(a)) | |
#define loop(i, x) for(__typeof((x).begin()) i=(x).begin(); i!=(x).end(); ++i) | |
#define rloop(i, x) for(__typeof((x).rbegin()) i=(x).rbegin(); i!=(x).rend(); ++i) | |
/*------------------------------------------------------------------------------------*/ | |
int test, cas = 1; | |
const int SIZ = 10000050; // maximum possible size | |
int n; // text length | |
int m; // pattern length | |
char T[SIZ]; // text | |
char P[SIZ]; // pattern | |
int SA[SIZ], tempSA[SIZ]; // the sorted suffixes | |
int RA[SIZ], tempRA[SIZ]; // ranks of suffix array | |
int L[SIZ]; // used in counting sort | |
inline int getRA(int i) | |
{ | |
return (i < n) ? RA[i] : 0; | |
} | |
void radix_sort(int k) | |
{ | |
mem(L, 0); | |
// count frequencies | |
for(int i = 0; i < n; ++i) | |
{ | |
L[getRA(i + k)]++; | |
} | |
// save first index of every characters | |
int mx = max(n, 130); | |
for(int i = 0, s = 0; i < mx; ++i) | |
{ | |
int x = L[i]; | |
L[i] = s; | |
s += x; | |
} | |
// build sorted tempSA | |
for(int i = 0; i < n; ++i) | |
{ | |
int& x = L[getRA(SA[i] + k)]; | |
tempSA[x++] = SA[i]; | |
} | |
// copy tempSA to SA | |
for(int i = 0; i < n; ++i) | |
{ | |
SA[i] = tempSA[i]; | |
} | |
} | |
// text must ends with a $ | |
void buildSA() | |
{ | |
// initialize | |
n = strlen(T); | |
T[n++] = '$', T[n] = 0; // append $ | |
for(int i = 0; i < n; ++i) | |
{ | |
SA[i] = i; | |
RA[i] = T[i]; | |
} | |
// algorithm loop | |
for(int k = 1; k < n; k <<= 1) | |
{ | |
// sort by k-th ranks | |
radix_sort(k); | |
radix_sort(0); | |
// compute new ranks | |
tempRA[SA[0]] = 0; | |
for(int i = 1, r = 0; i < n; ++i) | |
{ | |
if(getRA(SA[i-1]) != getRA(SA[i])) { | |
r++; | |
} | |
else if(getRA(SA[i-1]+k) != getRA(SA[i]+k)) { | |
r++; | |
} | |
tempRA[SA[i]] = r; | |
} | |
for(int i = 0; i < n; ++i) | |
{ | |
RA[i] = tempRA[i]; | |
} | |
if(RA[SA[n - 1]] == n - 1) break; | |
} | |
} | |
// Count the number of times a given pattern P appears | |
// in the string T. Complexity: O(n.log(n)) | |
int countOccurences() | |
{ | |
m = strlen(P); | |
SA[n] = n; | |
int first, last; | |
int low, high, mid; | |
// find lower bound | |
low = 0, high = n; | |
while(low < high) | |
{ | |
mid = (low + high) >> 1; | |
printf("lower: %d %d %d\n", low, mid, high); | |
if(strncmp(P, T + SA[mid], m) <= 0) | |
{ | |
high = mid; | |
} | |
else | |
{ | |
low = mid + 1; | |
} | |
} | |
printf("lower: %d %d %d\n", low, mid, high); | |
first = low; | |
// find higher bound | |
low = 0, high = n; | |
while(low < high) | |
{ | |
mid = (low + high) >> 1; | |
printf("upper: %d %d %d\n", low, mid, high); | |
if(strncmp(P, T + SA[mid], m) < 0) | |
{ | |
high = mid; | |
} | |
else | |
{ | |
low = mid + 1; | |
} | |
} | |
printf("upper: %d %d %d\n", low, mid, high); | |
last = low; | |
printf("%d %d\n", first, last); | |
return last - first; | |
} | |
int main() | |
{ | |
/* | |
GATAGACA | |
GA | |
abcabxabcd | |
abc | |
mississippi | |
i | |
*/ | |
printf("Text: "); | |
scanf("%s", T); | |
printf("Pattern: "); | |
scanf("%s", P); | |
buildSA(); | |
for(int i = 0; i < n; ++i) | |
{ | |
printf("%2d | %2d | %s\n", i, SA[i], T + SA[i]); | |
} | |
int ans = countOccurences(); | |
printf("Pattern found %d times\n", ans); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment