Created
March 4, 2013 22:23
-
-
Save anonymous/5086232 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include<iostream> | |
#include<unistd.h> | |
#include<algorithm> | |
#include<map> | |
#include<vector> | |
#include<string> | |
#include<cassert> | |
#include<sstream> | |
#include<fstream> | |
using namespace std; | |
char fix_character(char c) | |
{ | |
if(c >= 'a' && c <= 'z') | |
return c; | |
if(c >= 'A' && c <= 'Z') | |
return c + 32; | |
return -1; | |
} | |
string fix_characters(string s) | |
{ | |
string new_string; | |
for(size_t i = 0; i < s.size(); i++) { | |
char c = fix_character(s[i]); | |
if(c != -1) new_string += c; | |
} | |
return new_string; | |
} | |
#define TRIE_SIZE 26 | |
struct Trie { | |
vector<Trie*> children; | |
bool endpoint; | |
Trie() | |
{ | |
this->children.assign(TRIE_SIZE, NULL); | |
this->endpoint = false; | |
} | |
void add(string word) | |
{ | |
word = fix_characters(word); | |
add(word, 0); | |
} | |
void find_and_add_to_map(string word, map<string, unsigned int> &hits) | |
{ | |
find_and_add_to_map(word, hits, 0); | |
} | |
private: | |
void add(string &word, size_t index) | |
{ | |
if(index == word.size()) { | |
this->endpoint = true; | |
} else { | |
size_t character = word[index] - 'a'; | |
if(character > TRIE_SIZE) { | |
cerr << "TURBO: Refusing to add " << word << " to search trie." << endl; | |
return; | |
} | |
if(this->children[character] == NULL) | |
this->children[character] = new Trie; | |
this->children[character]->add(word, index + 1); | |
} | |
} | |
void find_and_add_to_map(string &word, map<string, unsigned int> &hits, size_t index) | |
{ | |
if(this->endpoint) | |
hits[word.substr(0, index)]++; | |
if(index == word.size() + 1) | |
return; | |
size_t character = word[index] - 'a'; | |
if(character > TRIE_SIZE) // error | |
return; | |
if(this->children[character] == NULL) | |
return; | |
this->children[character]->find_and_add_to_map(word, hits, index + 1); | |
} | |
}; | |
Trie build_search_trie() | |
{ | |
Trie root; | |
ifstream search("search.csv"); | |
string line; | |
while(getline(search, line)) { | |
stringstream ss(line); | |
getline(ss, line, ';'); | |
root.add(line); | |
} | |
cerr << "TURBO: Done building search trie." << endl; | |
return root; | |
} | |
map<string, unsigned int> corpus_search(Trie &root) | |
{ | |
ifstream corpus("corpus.csv"); | |
string line; | |
map<string, unsigned int> hits; | |
while(getline(corpus, line)) { | |
stringstream ss(line); | |
for(size_t i = 0; i <= 3 && getline(ss, line, '|'); i++) { | |
if(i == 0) continue; | |
line = fix_characters(line); | |
for(size_t k = 0; k < line.size(); k++) | |
root.find_and_add_to_map(line.substr(k, line.size() - k), hits); | |
} | |
} | |
return hits; | |
} | |
int main() | |
{ | |
Trie root = build_search_trie(); | |
auto hits = corpus_search(root); | |
// cout << "term;count" << endl; | |
// for(auto hit : hits) | |
// cout << hit.first << ";" << hit.second << endl; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment