Skip to content

Instantly share code, notes, and snippets.

@PlushBeaver
Created March 3, 2017 19:56
Show Gist options
  • Save PlushBeaver/e3c863cf9d0abb69824241942200564b to your computer and use it in GitHub Desktop.
Save PlushBeaver/e3c863cf9d0abb69824241942200564b to your computer and use it in GitHub Desktop.
Входной тест для http://cpp-school.unigine.com/ с поддержкой UTF-8 при помощи велосипеда.
#include <iostream>
#include <fstream>
#include <unordered_map>
#include <algorithm>
using namespace std;
using Code = uint16_t;
enum Symbol { Letter, Delimiter, Unsupported, Invalid, End };
void append_utf8(string& to, Code code) ;
tuple<Symbol, Code> read_symbol_as_lowercase(istream& input) ;
int
main(int argc, char* argv[]) {
if (argc != 3) {
cerr << "Usage: " << argv[0] << " input.txt output.txt\n";
return EXIT_FAILURE;
}
unordered_map<string, size_t> frequences;
ifstream input(argv[1]);
enum { Scanning, Skipping } state = Skipping;
string word;
while (input) {
const auto maybe_letter = read_symbol_as_lowercase(input);
const Symbol kind = get<0>(maybe_letter);
if (kind == Unsupported) {
throw runtime_error("unsupported symbol");
}
if (kind == Invalid) {
throw runtime_error("invalid symbol");
}
switch (state) {
case Skipping:
if (kind == Letter) {
word.clear();
append_utf8(word, get<1>(maybe_letter));
state = Scanning;
}
break;
case Scanning:
if (kind == Letter) {
append_utf8(word, get<1>(maybe_letter));
}
else if (kind == Delimiter) {
frequences[word]++;
state = Skipping;
}
break;
}
}
ofstream output(argv[2]);
while (!frequences.empty()) {
using Pair = decltype(frequences)::value_type;
auto iterator = max_element(begin(frequences), end(frequences),
[](const Pair& lhs, const Pair& rhs) {
if (lhs.second == rhs.second) {
return lhs.first >= rhs.first;
}
return lhs.second < rhs.second;
});
output << iterator->second << ' ' << iterator->first << '\n';
frequences.erase(iterator);
}
return EXIT_SUCCESS;
}
tuple<Symbol, Code>
read_symbol_as_lowercase(istream& input) {
const auto first = static_cast<uint8_t>(input.get());
if (!input) {
return make_tuple(End, first);
}
if (first < 0x80) {
const char symbol = first;
if (isalpha(symbol)) {
return make_tuple(Letter, tolower(symbol));
}
return make_tuple(Delimiter, symbol);
}
if (!input) {
input.unget();
return make_tuple(Invalid, first);
}
if ((first & 0xe0) != 0xc0) {
input.unget();
return make_tuple(Unsupported, first);
}
const auto second = static_cast<uint8_t>(input.peek());
const auto code = static_cast<Code>(((first & 0x1f) << 6) | (second & 0x3f));
if (0x0410 <= code && code <= 0x042f) {
input.get();
return make_tuple(Letter, code + 32);
}
else if ((0x0430 <= code && code <= 0x044f) || code == 0x0451) {
input.get();
return make_tuple(Letter, code);
}
else if (code == 0x0401) {
input.get();
return make_tuple(Letter, 0x0451);
}
else {
input.unget();
return make_tuple(Unsupported, code);
}
}
void
append_utf8(string& to, Code code) {
if (code < 0x80) {
to += static_cast<char>(code);
}
else if (0x80 <= code && code <= 0x07ff) {
to += static_cast<char>((code >> 6) | 0xc0);
to += static_cast<char>((code & 0x3f) | 0x80);
}
else {
throw runtime_error("only two-byte UTF8 characters supported");
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment