Last active
November 4, 2017 20:39
-
-
Save jl2/4660503 to your computer and use it in GitHub Desktop.
A simple trie data structure in C++.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
trie: main.cpp Makefile | |
clang++ -O3 -std=c++11 -stdlib=libc++ -Wall -o trie main.cpp -lboost_system |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Simple trie data structure to play with c++11. | |
Copyright (c) 2017, Jeremiah LaRocco <[email protected]> | |
Permission to use, copy, modify, and/or distribute this software for any | |
purpose with or without fee is hereby granted, provided that the above | |
copyright notice and this permission notice appear in all copies. | |
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | |
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | |
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | |
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | |
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | |
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | |
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |
*/ | |
#include <fstream> | |
#include <iostream> | |
#include <map> | |
#include <set> | |
#include <string> | |
#include <chrono> | |
#include <boost/algorithm/string.hpp> | |
namespace sc = std::chrono; | |
typedef std::set<std::string> wordset_t; | |
class trie_t; | |
typedef std::map<char, trie_t*> child_map_t; | |
class trie_t { | |
public: | |
trie_t(bool end = false) :_size(0), _isEnd(end) {} | |
~trie_t() { | |
for (auto &it : _children) { | |
delete it.second; | |
} | |
} | |
void addWord(std::string word) { | |
if (word.length()>0) { | |
// counts number of words added | |
// but if you add the same word more than once | |
// this size will differ from the set size | |
// which is unique words in trie | |
++_size; | |
std::string subword = word.substr(1, word.size()-1); | |
if (_children[word[0]]) { | |
if ( word.size() == 1 ) | |
_children[word[0]]->_isEnd = true; | |
else | |
_children[word[0]]->addWord(subword); | |
} else { | |
trie_t *tmp = new trie_t(word.size()==1); | |
tmp->addWord(subword); | |
_children[word[0]] = tmp; | |
} | |
} | |
} | |
bool isPrefix(std::string pref) const { | |
if (pref.length()== 0) { | |
return true; | |
} | |
if (_children.find(pref[0]) != _children.end()) { | |
return _children.find(pref[0])->second->isPrefix(pref.substr(1, pref.size()-1)); | |
} | |
return false; | |
} | |
bool isWord(std::string word) const { | |
if (word.length()== 0) { | |
return _isEnd; | |
} | |
std::string cursub; | |
trie_t const *tmp = this; | |
cursub = word; | |
while (cursub.length()>0) { | |
if (tmp->_children.find(cursub[0]) != tmp->_children.end()) { | |
tmp = tmp->_children.find(cursub[0])->second; | |
cursub = cursub.substr(1, cursub.size()-1); | |
} else { | |
return false; | |
} | |
} | |
return tmp->isWordEnd(); | |
} | |
size_t size() { | |
return _size; | |
} | |
void getWords(wordset_t &words, std::string wordSoFar="") const { | |
if (_isEnd) { | |
words.insert(wordSoFar); | |
} | |
for (const auto &it : _children) { | |
std::string tmp = wordSoFar + std::string(1, it.first); | |
if (it.second && it.second->isWordEnd()) { | |
words.insert(tmp); | |
} | |
it.second->getWords(words, tmp); | |
} | |
} | |
void getWordsStartingWith(std::string prefix, wordset_t &words, std::string wordSoFar="") const { | |
if (prefix.size() == 0) { | |
getWords(words, wordSoFar); | |
return; | |
} | |
std::string subword = prefix.substr(1, prefix.size()-1); | |
if (_children.find(prefix[0]) != _children.end()) { | |
trie_t *tmp = _children.find(prefix[0])->second; | |
std::string nwsf = wordSoFar + std::string(1, prefix[0]); | |
tmp->getWordsStartingWith(subword, words, nwsf); | |
} | |
} | |
private: | |
bool isWordEnd() const { | |
return _isEnd; | |
} | |
private: | |
child_map_t _children; | |
size_t _size; | |
bool _isEnd; | |
}; | |
void buildDictionaryTrie(trie_t &ot, std::string fname) { | |
// const std::regex wordRx("[a-z]+"); | |
std::ifstream inf(fname); | |
std::string curWord; | |
while (inf >> curWord) { | |
boost::to_lower(curWord); | |
if (curWord.find_first_not_of("abcdefghijklmnopqrstuvwxyz") == std::string::npos) { | |
// if (std::regex_match(curWord, wordRx)) { | |
ot.addWord(curWord); | |
} | |
} | |
} | |
sc::time_point<sc::steady_clock> tstamp() { | |
return sc::steady_clock::now(); | |
} | |
int main(int argc, char *argv[]) { | |
if (argc != 2) { | |
std::cout << "No word file given!\n"; | |
return 1; | |
} | |
std::string wordfile = argv[1]; | |
trie_t theTrie; | |
std::cout << "Building trie...\n"; | |
auto start = tstamp(); | |
buildDictionaryTrie(theTrie, wordfile); | |
auto end = tstamp(); | |
auto diff = end - start; | |
std::cout << "Building trie took " << sc::duration<double, std::milli>(diff).count() | |
<< " ms\n"; | |
std::cout << "The trie has " << theTrie.size() << " words.\n"; | |
// TODO: Find out why these sizes can be different - probably a bug | |
wordset_t wset; | |
theTrie.getWords(wset); | |
std::cout << "The word set has " << wset.size() << " words.\n"; | |
std::string prefix; | |
while (std::cin >> prefix) { | |
start = tstamp(); | |
bool isp = theTrie.isPrefix(prefix); | |
end = tstamp(); | |
diff = end-start; | |
std::cout << "Took " <<sc::duration<double, std::milli>(diff).count() | |
<< " to find that \"" << prefix << "\" " << (isp?"is":"is not") | |
<< " a prefix\n"; | |
wset.clear(); | |
if (isp) { | |
start = tstamp(); | |
theTrie.getWordsStartingWith(prefix, wset); | |
end = tstamp(); | |
diff = end-start; | |
std::cout << "Took " <<sc::duration<double, std::milli>(diff).count() | |
<< " to find " << wset.size() << " words starting with \"" | |
<< prefix << "\":\n"; | |
for (const auto &wrd : wset) { | |
std::cout << "\t" << wrd << "\n"; | |
} | |
} | |
} | |
return 0; | |
} |
Here is a fix for the foozap fooza issue:
void addWord(std::string word) {
if (word.length()>0) {
// counts number of words added
// but if you add the same word more than once
// this size will differ from the set size
// which is unique words in trie
++_size;
std::string subword = word.substr(1, word.size()-1);
if (_children[word[0]]) {
if ( word.size() == 1 )
_children[word[0]]->_isEnd = true;
else
_children[word[0]]->addWord(subword);
} else {
trie_t *tmp = new trie_t(word.size()==1);
tmp->addWord(subword);
_children[word[0]] = tmp;
}
}
}
Would you consider adding a license to your code? I'm working on a personal project that makes use of it, and in the future I may want to put that project up on GitHub or elsewhere. I don't think I can do that without a license.
Thanks!
That's neat, I had no idea people had found this and were using it.
Anyway, I've added a copyright statement and updated the addWord method, so hopefully it's more useful to people.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
If you add the following words:
you will notice that
fooza
does not get tagged with_isEnd = true
because it was previously added viafoozap
so addWord needs to be tweaked unless I'm missing something.Also in addWord size counts the number of words added but if you add the same word multiple times then that number will not sync with wordset_t.size() which reports unique words.