Created
January 8, 2020 05:20
-
-
Save cdacamar/e4644dad6f054fefa76c2286bd166bcb to your computer and use it in GitHub Desktop.
Word frequencies
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// cl /EHsc /std:c++latest /permissive- /O2 word_frequencies.cpp | |
#include <cctype> | |
#include <cstdio> | |
#include <algorithm> | |
#include <charconv> | |
#include <fstream> | |
#include <string> | |
#include <string_view> | |
#include <unordered_map> | |
#include <vector> | |
using WordFrequencyMap = std::unordered_map<std::string, int>; | |
constexpr auto isalpha_surrogate = [](auto c) { return !!std::isalpha(c); }; | |
static void extract_words_from_line(const std::string& line, WordFrequencyMap& frequency_map) { | |
auto first = std::begin(line); | |
auto last = std::end(line); | |
while (first != last) { | |
first = std::find_if(first, last, isalpha_surrogate); | |
if (first == last) { | |
return; | |
} | |
auto word_end = std::find_if_not(first, last, isalpha_surrogate); | |
++frequency_map[std::string{ first, word_end }]; | |
first = word_end; | |
} | |
} | |
static WordFrequencyMap bucket_word_frequencies(std::ifstream& file) { | |
WordFrequencyMap frequency_map; | |
std::string line; | |
while (std::getline(file, line)) { | |
extract_words_from_line(line, frequency_map); | |
} | |
return frequency_map; | |
} | |
using WordsAndFrequencies = std::vector<std::pair<std::string, int>>; | |
WordsAndFrequencies flatten(const WordFrequencyMap& frequency_map) { | |
WordsAndFrequencies flattened; | |
flattened.reserve(frequency_map.size()); | |
for (auto& [word, frequency] : frequency_map) { | |
flattened.push_back({ word, frequency }); | |
} | |
return flattened; | |
} | |
using TopNSorted = std::pair<WordsAndFrequencies::iterator, WordsAndFrequencies::iterator>; | |
TopNSorted top_n(WordsAndFrequencies& words, int top) { | |
auto partition_point = [&] { | |
if (top <= -1 || top >= words.size()) { | |
return std::end(words); | |
} | |
auto first = std::begin(words); | |
auto p = first + top; | |
std::nth_element(first, p, std::end(words), [](const auto& lhs, const auto& rhs) { return lhs.second > rhs.second; }); | |
return p; | |
}(); | |
auto first = std::begin(words); | |
std::sort(first, partition_point, [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; }); | |
return { first, partition_point }; | |
} | |
int main(int argc, const char* argv[]) { | |
if (argc < 2) { | |
std::printf(".\\%s: not enough arguments\n", argv[0]); | |
return 1; | |
} | |
int top = -1; // means take all. | |
if (argc > 2) { | |
std::string_view top_n_str { argv[2] }; | |
auto [_, ec] = std::from_chars(top_n_str.data(), top_n_str.data() + top_n_str.length(), top); | |
if (ec != std::errc{}) { | |
std::printf(".\\%s: failed to parse \"%s\"\n", argv[0], argv[2]); | |
return 1; | |
} | |
if (top < -1) { | |
std::printf(".\\%s: argument \"%s\" cannot be negative.\n", argv[0], argv[2]); | |
return 1; | |
} | |
} | |
std::ifstream file{argv[1]}; | |
if (!file) { | |
std::printf(".\\%s: could not open file \"%s\" for reading.\n", argv[0], argv[1]); | |
return 1; | |
} | |
auto word_frequencies = bucket_word_frequencies(file); | |
auto flattened = flatten(word_frequencies); | |
auto [first, last] = top_n(flattened, top); | |
while (first != last) { | |
auto [word, frequency] = *first; | |
std::printf("word: \"%s\", appears: %d\n", word.c_str(), frequency); | |
++first; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment