Last active
August 4, 2016 20:51
-
-
Save celic/bbb0c372782453405ae5a64eda852d31 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <fstream> | |
#include <vector> | |
#include <string> | |
#include <map> | |
using namespace std; | |
void find_substrings(int substr_len, string &text, map<string, vector<int>> &indexes){ | |
// First iteration | |
if(substr_len == 2){ | |
// Store all 2-tuples that appear in the text | |
for(int i = 0; i < len-(substr_len-1); i++){ | |
indexes[text.substr(i, substr_len)].push_back(i); | |
} | |
// All other iterations, don't just find all n-tuples naively like 2-tuples | |
// Any (n+1)-tuple must build upon an n-tuple, so just take the existing indexes and build on them | |
}else{ | |
vector<int> good_indexes; | |
map<string, vector<int>>::iterator itr; | |
// Store all the indexes of n-tuple substrings that occur more than once | |
for(itr = indexes.begin(); itr != indexes.end(); ++itr){ | |
for(int i = 0; i < itr->second.size(); ++i){ | |
good_indexes.push_back(itr->second[i]); | |
} | |
} | |
indexes.clear(); | |
// Extend the n-tuples to (n+1)-tuples and store like before | |
for(int i = 0; i < good_indexes.size(); i++){ | |
indexes[text.substr(good_indexes[i], substr_len)].push_back(good_indexes[i]); | |
} | |
} | |
} | |
void erase_substrings(map<string, vector<int>> &indexes){ | |
// Prune the map of any substrings that occur only once | |
map<string, vector<int>>::iterator itr = indexes.begin(); | |
while(itr != indexes.end()){ | |
if(itr->second.size() < 2){ | |
indexes.erase(itr++); | |
}else{ | |
++itr; | |
} | |
} | |
} | |
int len_LRS(string text){ | |
// String is the substring we are looking at, vector stores the indexes those substrings begin at | |
map<string, vector<int>> indexes; | |
int substr_len = 2; | |
// Progressively grow the length of the n-tuples to look for | |
while(true){ | |
find_substrings(substr_len, indexes); | |
erase_substrings(indexes); | |
if(indexes.empty()) break; | |
substr_len++; | |
} | |
// We always advance one further than we need to | |
return substr_len-1; | |
} | |
int main(){ | |
ifstream file; | |
string text; | |
string line; | |
file.open("moby_dick.txt"); | |
while(getline(file, line)) text += line; | |
file.close(); | |
//text = "abcdeabcdeabc"; | |
cout << len_LRS(text) << endl; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Memory sensitive longest repeated substring algorithm. Rather than use the O(n^2)-memory dynamic programming approach, this takes a slightly slower route to conserve memory allowing you to run the algorithm on all of Moby Dick, for example, in a short amount of time.