dipu-bd · June 29, 2017 11:13 · dipu-bd · Jun 28, 2017
diff --git a/SuffixArray+LCP.cpp b/SuffixArray+LCP.cpp
 /*====================================================================
 Title: Compute LCP from Suffix Array using Radix Sort
 Suffix Array: O(n.log(n))
 Compute LCP: O(n)
 Author : Sudipto Chandra (Dipu)
 =====================================================================*/
 #include <bits/stdc++.h>
 using namespace std;
 #define mem(a,b) memset(a, b, sizeof(a))
 #define loop(i, x) for(__typeof((x).begin()) i=(x).begin(); i!=(x).end(); ++i)
 #define rloop(i, x) for(__typeof((x).rbegin()) i=(x).rbegin(); i!=(x).rend(); ++i)
 /*------------------------------------------------------------------------------------*/

 int test, cas = 1;

 const int SIZ = 10000050; // maximum possible size

 int n; // text length
 char T[SIZ]; // text
 int SA[SIZ], tempSA[SIZ]; // the sorted suffixes
 int RA[SIZ], tempRA[SIZ]; // ranks of suffix array
 int L[SIZ]; // used in counting sort
 int Phi[SIZ]; // the jumping parameter
 int LCP[SIZ]; // longest common prefix
 int PLCP[SIZ]; // permuted longest common prefix

 inline int getRA(int i)
 {
    return (i < n) ? RA[i] : 0;
 }

 void radix_sort(int k)
 {
    mem(L, 0);
    // count frequencies
    for(int i = 0; i < n; ++i)
    {
        L[getRA(i + k)]++;
    }
    // save first index of every characters
    int mx = max(n, 130);
    for(int i = 0, s = 0; i < mx; ++i)
    {
        int x = L[i];
        L[i] = s;
        s += x;
    }
    // build sorted tempSA
    for(int i = 0; i < n; ++i)
    {
        int& x = L[getRA(SA[i] + k)];
        tempSA[x++] = SA[i];
    }
    // copy tempSA to SA
    for(int i = 0; i < n; ++i)
    {
        SA[i] = tempSA[i];
    }
 }
 // text must ends with a $
 void buildSA()
 {
    // initialize
    n = strlen(T);
    T[n++] = '$', T[n] = 0; // append $
    for(int i = 0; i < n; ++i)
    {
        SA[i] = i;
        RA[i] = T[i];
    }

    // algorithm loop
    for(int k = 1; k < n; k <<= 1)
    {
        // sort by k-th ranks
        radix_sort(k);
        radix_sort(0);
        // compute new ranks
        tempRA[SA[0]] = 0;
        for(int i = 1, r = 0; i < n; ++i)
        {
            if(getRA(SA[i-1]) != getRA(SA[i])) {
                r++;
            }
            else if(getRA(SA[i-1]+k) != getRA(SA[i]+k)) {
                r++;
            }
            tempRA[SA[i]] = r;
        }
        for(int i = 0; i < n; ++i)
        {
            RA[i] = tempRA[i];
        }
        if(RA[SA[n - 1]] == n - 1) break;
    }
 }

 // Finds the Longest Common Prefix (LCP) between two adjacent suffixes
 // excluding the first suffix using the Permuted LCP (PLCP) theorem.
 void computeLCP()
 {
    Phi[SA[0]] = -1;
    for(int i = 1; i < n; ++i)
    {
        Phi[SA[i]] = SA[i - 1];
    }

    for(int i = 0, L = 0; i < n; ++i)
    {
        if(Phi[i] == -1)
        {
            PLCP[i] = 0;
            continue;
        }
        // longest common prefix length
        L = max(L - 1, 0);
        while(T[i + L] == T[Phi[i] + L])
        {
            L++;
        }
        PLCP[i] = L;
    }

    for(int i = 0; i < n; ++i)
    {
        LCP[i] = PLCP[SA[i]];
    }
 }

 void TEST()
 {
    int values[] = {
        10,
        100,
        1000,
        10000,
        50000,
        100000,
        500000,
        1000000,
        2000000,
        3000000,
        4000000,
        5000000,
        /*6000000,
        7000000,
        8000000*/
    };

    int siz = sizeof(values) / sizeof(int);
    double avg_cpi = 0;

    puts("");
    puts("|         n | Runtime(s) |    TPI(ms)   |");
    puts("|----------:|:----------:|:------------:|");

    for(int k = 0; k < siz; ++k)
    {
        int n = values[k];
        for(int i = 0; i < n; ++i)
        {
            if(rand() & 1)
            {
                T[i] = 'A' + (rand() % 26);
            }
            else if(rand() & 1)
            {
                T[i] = 'a' + (rand() % 26);
            }
            else
            {
                T[i] = '0' + (rand() % 10);
            }

        }
        T[n] = 0;

        time_t start = clock();
        buildSA(); // builds the suffix array
        computeLCP(); // calculate longest common prefix
        time_t stop = clock();

        double time = (double)(stop - start) / CLOCKS_PER_SEC;
        double cpi = (double)(stop - start) / (n * log2(n));
        printf("| `%7d` |   `%5.3f`  | `%0.8f` |\n", n, time, cpi);

        if(k) avg_cpi += (values[k] - values[k - 1]) * cpi;
        else avg_cpi += values[k] * cpi;
    }
    avg_cpi /= values[siz - 1];
    printf("\n**Average *Time Per Instructions*** = `%.10f ms`\n", avg_cpi);
 }

 void RUN()
 {
    /*
        GATAGACA
        abcabxabcd
    */

    gets(T);
    buildSA();
    computeLCP();

    puts("=============================================");
    printf("|  i | SA | Phi | PLCP |    suffix    | LCP |\n");
    printf("|----|----|-----|------|--------------|-----|\n");
    for(int i = 0; i < n; ++i)
    {
        printf("| %2d | %2d | %3d |  %2d  | %-12s | %3d |\n", i, SA[i], Phi[i], PLCP[i], T + i, LCP[i]);
    }
    puts("=============================================");


 }

 int main()
 {
    //RUN();
    TEST();
    return 0;
 }
	/*====================================================================
	Title: Compute LCP from Suffix Array using Radix Sort
	Suffix Array: O(n.log(n))
	Compute LCP: O(n)
	Author : Sudipto Chandra (Dipu)
	=====================================================================*/
	#include <bits/stdc++.h>
	using namespace std;
	#define mem(a,b) memset(a, b, sizeof(a))
	#define loop(i, x) for(__typeof((x).begin()) i=(x).begin(); i!=(x).end(); ++i)
	#define rloop(i, x) for(__typeof((x).rbegin()) i=(x).rbegin(); i!=(x).rend(); ++i)
	/------------------------------------------------------------------------------------/

	int test, cas = 1;

	const int SIZ = 10000050; // maximum possible size

	int n; // text length
	char T[SIZ]; // text
	int SA[SIZ], tempSA[SIZ]; // the sorted suffixes
	int RA[SIZ], tempRA[SIZ]; // ranks of suffix array
	int L[SIZ]; // used in counting sort
	int Phi[SIZ]; // the jumping parameter
	int LCP[SIZ]; // longest common prefix
	int PLCP[SIZ]; // permuted longest common prefix

	inline int getRA(int i)
	{
	return (i < n) ? RA[i] : 0;
	}

	void radix_sort(int k)
	{
	mem(L, 0);
	// count frequencies
	for(int i = 0; i < n; ++i)
	{
	L[getRA(i + k)]++;
	}
	// save first index of every characters
	int mx = max(n, 130);
	for(int i = 0, s = 0; i < mx; ++i)
	{
	int x = L[i];
	L[i] = s;
	s += x;
	}
	// build sorted tempSA
	for(int i = 0; i < n; ++i)
	{
	int& x = L[getRA(SA[i] + k)];
	tempSA[x++] = SA[i];
	}
	// copy tempSA to SA
	for(int i = 0; i < n; ++i)
	{
	SA[i] = tempSA[i];
	}
	}
	// text must ends with a $
	void buildSA()
	{
	// initialize
	n = strlen(T);
	T[n++] = '$', T[n] = 0; // append $
	for(int i = 0; i < n; ++i)
	{
	SA[i] = i;
	RA[i] = T[i];
	}

	// algorithm loop
	for(int k = 1; k < n; k <<= 1)
	{
	// sort by k-th ranks
	radix_sort(k);
	radix_sort(0);
	// compute new ranks
	tempRA[SA[0]] = 0;
	for(int i = 1, r = 0; i < n; ++i)
	{
	if(getRA(SA[i-1]) != getRA(SA[i])) {
	r++;
	}
	else if(getRA(SA[i-1]+k) != getRA(SA[i]+k)) {
	r++;
	}
	tempRA[SA[i]] = r;
	}
	for(int i = 0; i < n; ++i)
	{
	RA[i] = tempRA[i];
	}
	if(RA[SA[n - 1]] == n - 1) break;
	}
	}

	// Finds the Longest Common Prefix (LCP) between two adjacent suffixes
	// excluding the first suffix using the Permuted LCP (PLCP) theorem.
	void computeLCP()
	{
	Phi[SA[0]] = -1;
	for(int i = 1; i < n; ++i)
	{
	Phi[SA[i]] = SA[i - 1];
	}

	for(int i = 0, L = 0; i < n; ++i)
	{
	if(Phi[i] == -1)
	{
	PLCP[i] = 0;
	continue;
	}
	// longest common prefix length
	L = max(L - 1, 0);
	while(T[i + L] == T[Phi[i] + L])
	{
	L++;
	}
	PLCP[i] = L;
	}

	for(int i = 0; i < n; ++i)
	{
	LCP[i] = PLCP[SA[i]];
	}
	}

	void TEST()
	{
	int values[] = {
	10,
	100,
	1000,
	10000,
	50000,
	100000,
	500000,
	1000000,
	2000000,
	3000000,
	4000000,
	5000000,
	/*6000000,
	7000000,
	8000000*/
	};

	int siz = sizeof(values) / sizeof(int);
	double avg_cpi = 0;

	puts("");
	puts("\| n \| Runtime(s) \| TPI(ms) \|");
	puts("\|----------:\|:----------:\|:------------:\|");

	for(int k = 0; k < siz; ++k)
	{
	int n = values[k];
	for(int i = 0; i < n; ++i)
	{
	if(rand() & 1)
	{
	T[i] = 'A' + (rand() % 26);
	}
	else if(rand() & 1)
	{
	T[i] = 'a' + (rand() % 26);
	}
	else
	{
	T[i] = '0' + (rand() % 10);
	}

	}
	T[n] = 0;

	time_t start = clock();
	buildSA(); // builds the suffix array
	computeLCP(); // calculate longest common prefix
	time_t stop = clock();

	double time = (double)(stop - start) / CLOCKS_PER_SEC;
	double cpi = (double)(stop - start) / (n * log2(n));
	printf("\| `%7d` \| `%5.3f` \| `%0.8f` \|\n", n, time, cpi);

	if(k) avg_cpi += (values[k] - values[k - 1]) * cpi;
	else avg_cpi += values[k] * cpi;
	}
	avg_cpi /= values[siz - 1];
	printf("\n*Average Time Per Instructions*** = `%.10f ms`\n", avg_cpi);
	}

	void RUN()
	{
	/*
	GATAGACA
	abcabxabcd
	*/

	gets(T);
	buildSA();
	computeLCP();

	puts("=============================================");
	printf("\| i \| SA \| Phi \| PLCP \| suffix \| LCP \|\n");
	printf("\|----\|----\|-----\|------\|--------------\|-----\|\n");
	for(int i = 0; i < n; ++i)
	{
	printf("\| %2d \| %2d \| %3d \| %2d \| %-12s \| %3d \|\n", i, SA[i], Phi[i], PLCP[i], T + i, LCP[i]);
	}
	puts("=============================================");


	}

	int main()
	{
	//RUN();
	TEST();
	return 0;
	}
n	Runtime(s)	TPI(ms)
`10`	`0.015`	`0.45154499`
`100`	`0.025`	`0.03762875`
`1000`	`0.030`	`0.00301030`
`10000`	`0.036`	`0.00027093`
`50000`	`0.040`	`0.00005125`
`100000`	`0.045`	`0.00002709`
`500000`	`0.130`	`0.00001373`
`1000000`	`0.420`	`0.00002107`
`2000000`	`1.105`	`0.00002640`
`3000000`	`2.946`	`0.00004564`
`4000000`	`4.218`	`0.00004808`
`5000000`	`5.622`	`0.00005053`