cdacamar · January 8, 2020 05:20
diff --git a/frequencies.cpp b/frequencies.cpp
 // cl /EHsc /std:c++latest /permissive- /O2 word_frequencies.cpp
 #include <cctype>
 #include <cstdio>

 #include <algorithm>
 #include <charconv>
 #include <fstream>
 #include <string>
 #include <string_view>
 #include <unordered_map>
 #include <vector>

 using WordFrequencyMap = std::unordered_map<std::string, int>;

 constexpr auto isalpha_surrogate = [](auto c) { return !!std::isalpha(c); };

 static void extract_words_from_line(const std::string& line, WordFrequencyMap& frequency_map) {
  auto first = std::begin(line);
  auto last = std::end(line);
  while (first != last) {
    first = std::find_if(first, last, isalpha_surrogate);
    if (first == last) {
      return;
    }
    auto word_end = std::find_if_not(first, last, isalpha_surrogate);
    ++frequency_map[std::string{ first, word_end }];
    first = word_end;
  }
 }

 static WordFrequencyMap bucket_word_frequencies(std::ifstream& file) {
  WordFrequencyMap frequency_map;
  std::string line;
  while (std::getline(file, line)) {
    extract_words_from_line(line, frequency_map);
  }
  return frequency_map;
 }

 using WordsAndFrequencies = std::vector<std::pair<std::string, int>>;

 WordsAndFrequencies flatten(const WordFrequencyMap& frequency_map) {
  WordsAndFrequencies flattened;
  flattened.reserve(frequency_map.size());
  for (auto& [word, frequency] : frequency_map) {
    flattened.push_back({ word, frequency });
  }
  return flattened;
 }

 using TopNSorted = std::pair<WordsAndFrequencies::iterator, WordsAndFrequencies::iterator>;

 TopNSorted top_n(WordsAndFrequencies& words, int top) {
  auto partition_point = [&] {
    if (top <= -1 || top >= words.size()) {
      return std::end(words);
    }
    auto first = std::begin(words);
    auto p = first + top;
    std::nth_element(first, p, std::end(words), [](const auto& lhs, const auto& rhs) { return lhs.second > rhs.second; });
    return p;
  }();
  auto first = std::begin(words);
  std::sort(first, partition_point, [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; });
  return { first, partition_point };
 }

 int main(int argc, const char* argv[]) {
  if (argc < 2) {
    std::printf(".\\%s: not enough arguments\n", argv[0]);
    return 1;
  }
  int top = -1; // means take all.
  if (argc > 2) {
    std::string_view top_n_str { argv[2] };
    auto [_, ec] = std::from_chars(top_n_str.data(), top_n_str.data() + top_n_str.length(), top);
    if (ec != std::errc{}) {
      std::printf(".\\%s: failed to parse \"%s\"\n", argv[0], argv[2]);
      return 1;
    }
    if (top < -1) {
      std::printf(".\\%s: argument \"%s\" cannot be negative.\n", argv[0], argv[2]);
      return 1;
    }
  }
  std::ifstream file{argv[1]};
  if (!file) {
    std::printf(".\\%s: could not open file \"%s\" for reading.\n", argv[0], argv[1]);
    return 1;
  }
  auto word_frequencies = bucket_word_frequencies(file);
  auto flattened = flatten(word_frequencies);
  auto [first, last] = top_n(flattened, top);
  while (first != last) {
    auto [word, frequency] = *first;
    std::printf("word: \"%s\", appears: %d\n", word.c_str(), frequency);
    ++first;
  }
 }
	// cl /EHsc /std:c++latest /permissive- /O2 word_frequencies.cpp
	#include <cctype>
	#include <cstdio>

	#include <algorithm>
	#include <charconv>
	#include <fstream>
	#include <string>
	#include <string_view>
	#include <unordered_map>
	#include <vector>

	using WordFrequencyMap = std::unordered_map<std::string, int>;

	constexpr auto isalpha_surrogate = [](auto c) { return !!std::isalpha(c); };

	static void extract_words_from_line(const std::string& line, WordFrequencyMap& frequency_map) {
	auto first = std::begin(line);
	auto last = std::end(line);
	while (first != last) {
	first = std::find_if(first, last, isalpha_surrogate);
	if (first == last) {
	return;
	}
	auto word_end = std::find_if_not(first, last, isalpha_surrogate);
	++frequency_map[std::string{ first, word_end }];
	first = word_end;
	}
	}

	static WordFrequencyMap bucket_word_frequencies(std::ifstream& file) {
	WordFrequencyMap frequency_map;
	std::string line;
	while (std::getline(file, line)) {
	extract_words_from_line(line, frequency_map);
	}
	return frequency_map;
	}

	using WordsAndFrequencies = std::vector<std::pair<std::string, int>>;

	WordsAndFrequencies flatten(const WordFrequencyMap& frequency_map) {
	WordsAndFrequencies flattened;
	flattened.reserve(frequency_map.size());
	for (auto& [word, frequency] : frequency_map) {
	flattened.push_back({ word, frequency });
	}
	return flattened;
	}

	using TopNSorted = std::pair<WordsAndFrequencies::iterator, WordsAndFrequencies::iterator>;

	TopNSorted top_n(WordsAndFrequencies& words, int top) {
	auto partition_point = [&] {
	if (top <= -1 \|\| top >= words.size()) {
	return std::end(words);
	}
	auto first = std::begin(words);
	auto p = first + top;
	std::nth_element(first, p, std::end(words), [](const auto& lhs, const auto& rhs) { return lhs.second > rhs.second; });
	return p;
	}();
	auto first = std::begin(words);
	std::sort(first, partition_point, [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; });
	return { first, partition_point };
	}

	int main(int argc, const char* argv[]) {
	if (argc < 2) {
	std::printf(".\\%s: not enough arguments\n", argv[0]);
	return 1;
	}
	int top = -1; // means take all.
	if (argc > 2) {
	std::string_view top_n_str { argv[2] };
	auto [_, ec] = std::from_chars(top_n_str.data(), top_n_str.data() + top_n_str.length(), top);
	if (ec != std::errc{}) {
	std::printf(".\\%s: failed to parse \"%s\"\n", argv[0], argv[2]);
	return 1;
	}
	if (top < -1) {
	std::printf(".\\%s: argument \"%s\" cannot be negative.\n", argv[0], argv[2]);
	return 1;
	}
	}
	std::ifstream file{argv[1]};
	if (!file) {
	std::printf(".\\%s: could not open file \"%s\" for reading.\n", argv[0], argv[1]);
	return 1;
	}
	auto word_frequencies = bucket_word_frequencies(file);
	auto flattened = flatten(word_frequencies);
	auto [first, last] = top_n(flattened, top);
	while (first != last) {
	auto [word, frequency] = *first;
	std::printf("word: \"%s\", appears: %d\n", word.c_str(), frequency);
	++first;
	}
	}