Created
October 19, 2015 10:19
-
-
Save goldsborough/9d1d446f503cf94118a8 to your computer and use it in GitHub Desktop.
Algorithms to separate a string without whitespace back into constituent words.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
std::vector<std::string> separate(const std::string& string, | |
const std::unordered_set<std::string>& dictionary) | |
{ | |
std::vector<std::string> words; | |
std::string current; | |
for (auto character = string.begin(), end = string.end(); | |
character != end; | |
++character) | |
{ | |
current += *character; | |
for (std::size_t i = 0; i < current.size(); ++i) | |
{ | |
auto substring = current.substr(i); | |
if (dictionary.count(substring)) | |
{ | |
if (i > 0) | |
{ | |
auto before = current.substr(0, i); | |
std::transform(before.begin(), | |
before.end(), | |
before.begin(), | |
[&] (auto c) { return std::toupper(c); }); | |
words.push_back(before); | |
} | |
words.push_back(substring); | |
current.clear(); | |
} | |
} | |
} | |
return words; | |
} | |
class StringSeparator | |
{ | |
public: | |
using dictionary_t = std::unordered_set<std::string>; | |
using result_t = std::pair<std::vector<std::string>, | |
std::pair<long, long>>; | |
using cache_t = std::unordered_map<std::string, result_t>; | |
static const dictionary_t default_dictionary; | |
std::vector<std::string> operator()(const std::string& string, | |
const dictionary_t& dictionary = default_dictionary) | |
{ | |
cache_t cache; | |
return _separate(string, dictionary, cache).first; | |
} | |
private: | |
static const long maximum; | |
static const long minimum; | |
result_t _separate(const std::string& string, | |
const dictionary_t& dictionary, | |
cache_t& cache) const | |
{ | |
if (string.empty()) return {{}, {0, 0}}; | |
auto index = cache.find(string); | |
if (index != cache.end()) return index->second; | |
result_t best = {{}, {minimum, maximum}}; | |
std::string current_string; | |
for (auto character = string.begin(), end = string.end(); | |
character != end; | |
++character) | |
{ | |
current_string += *character; | |
auto result = _separate(std::string(std::next(character), | |
string.end()), | |
dictionary, | |
cache); | |
bool contained = dictionary.count(current_string); | |
if (contained) result.second.first += 1; | |
else result.second.second += 1; | |
if ((result.second.first > best.second.first) || | |
(result.second.first == best.second.first && | |
result.second.second < best.second.second)) | |
{ | |
if (contained) best.first = {current_string}; | |
else best.first = {to_upper(current_string)}; | |
best.first.insert(best.first.end(), | |
result.first.begin(), | |
result.first.end()); | |
best.second = result.second; | |
} | |
} | |
cache[string] = best; | |
return best; | |
} | |
std::string to_upper(std::string string) const | |
{ | |
std::transform(string.begin(), | |
string.end(), | |
string.begin(), | |
[&] (char character) | |
{ return std::toupper(character); }); | |
return string; | |
} | |
}; | |
const StringSeparator::dictionary_t StringSeparator::default_dictionary = { | |
"looked", | |
"just", | |
"like", | |
"her", | |
"brother", | |
"the", | |
"dog", | |
"a", | |
"cat" | |
}; | |
const long StringSeparator::maximum = std::numeric_limits<long>::max(); | |
const long StringSeparator::minimum = std::numeric_limits<long>::min(); | |
std::vector<std::string> separate_string(const std::string& string) | |
{ | |
StringSeparator separate; | |
return separate(string); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment