Skip to content

Instantly share code, notes, and snippets.

@goldsborough
Created October 19, 2015 10:19
Show Gist options
  • Save goldsborough/9d1d446f503cf94118a8 to your computer and use it in GitHub Desktop.
Save goldsborough/9d1d446f503cf94118a8 to your computer and use it in GitHub Desktop.
Algorithms to separate a string without whitespace back into constituent words.
std::vector<std::string> separate(const std::string& string,
const std::unordered_set<std::string>& dictionary)
{
std::vector<std::string> words;
std::string current;
for (auto character = string.begin(), end = string.end();
character != end;
++character)
{
current += *character;
for (std::size_t i = 0; i < current.size(); ++i)
{
auto substring = current.substr(i);
if (dictionary.count(substring))
{
if (i > 0)
{
auto before = current.substr(0, i);
std::transform(before.begin(),
before.end(),
before.begin(),
[&] (auto c) { return std::toupper(c); });
words.push_back(before);
}
words.push_back(substring);
current.clear();
}
}
}
return words;
}
class StringSeparator
{
public:
using dictionary_t = std::unordered_set<std::string>;
using result_t = std::pair<std::vector<std::string>,
std::pair<long, long>>;
using cache_t = std::unordered_map<std::string, result_t>;
static const dictionary_t default_dictionary;
std::vector<std::string> operator()(const std::string& string,
const dictionary_t& dictionary = default_dictionary)
{
cache_t cache;
return _separate(string, dictionary, cache).first;
}
private:
static const long maximum;
static const long minimum;
result_t _separate(const std::string& string,
const dictionary_t& dictionary,
cache_t& cache) const
{
if (string.empty()) return {{}, {0, 0}};
auto index = cache.find(string);
if (index != cache.end()) return index->second;
result_t best = {{}, {minimum, maximum}};
std::string current_string;
for (auto character = string.begin(), end = string.end();
character != end;
++character)
{
current_string += *character;
auto result = _separate(std::string(std::next(character),
string.end()),
dictionary,
cache);
bool contained = dictionary.count(current_string);
if (contained) result.second.first += 1;
else result.second.second += 1;
if ((result.second.first > best.second.first) ||
(result.second.first == best.second.first &&
result.second.second < best.second.second))
{
if (contained) best.first = {current_string};
else best.first = {to_upper(current_string)};
best.first.insert(best.first.end(),
result.first.begin(),
result.first.end());
best.second = result.second;
}
}
cache[string] = best;
return best;
}
std::string to_upper(std::string string) const
{
std::transform(string.begin(),
string.end(),
string.begin(),
[&] (char character)
{ return std::toupper(character); });
return string;
}
};
const StringSeparator::dictionary_t StringSeparator::default_dictionary = {
"looked",
"just",
"like",
"her",
"brother",
"the",
"dog",
"a",
"cat"
};
const long StringSeparator::maximum = std::numeric_limits<long>::max();
const long StringSeparator::minimum = std::numeric_limits<long>::min();
std::vector<std::string> separate_string(const std::string& string)
{
StringSeparator separate;
return separate(string);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment