Created
March 3, 2017 19:56
-
-
Save PlushBeaver/e3c863cf9d0abb69824241942200564b to your computer and use it in GitHub Desktop.
Входной тест для http://cpp-school.unigine.com/ с поддержкой UTF-8 при помощи велосипеда.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <fstream> | |
#include <unordered_map> | |
#include <algorithm> | |
using namespace std; | |
using Code = uint16_t; | |
enum Symbol { Letter, Delimiter, Unsupported, Invalid, End }; | |
void append_utf8(string& to, Code code) ; | |
tuple<Symbol, Code> read_symbol_as_lowercase(istream& input) ; | |
int | |
main(int argc, char* argv[]) { | |
if (argc != 3) { | |
cerr << "Usage: " << argv[0] << " input.txt output.txt\n"; | |
return EXIT_FAILURE; | |
} | |
unordered_map<string, size_t> frequences; | |
ifstream input(argv[1]); | |
enum { Scanning, Skipping } state = Skipping; | |
string word; | |
while (input) { | |
const auto maybe_letter = read_symbol_as_lowercase(input); | |
const Symbol kind = get<0>(maybe_letter); | |
if (kind == Unsupported) { | |
throw runtime_error("unsupported symbol"); | |
} | |
if (kind == Invalid) { | |
throw runtime_error("invalid symbol"); | |
} | |
switch (state) { | |
case Skipping: | |
if (kind == Letter) { | |
word.clear(); | |
append_utf8(word, get<1>(maybe_letter)); | |
state = Scanning; | |
} | |
break; | |
case Scanning: | |
if (kind == Letter) { | |
append_utf8(word, get<1>(maybe_letter)); | |
} | |
else if (kind == Delimiter) { | |
frequences[word]++; | |
state = Skipping; | |
} | |
break; | |
} | |
} | |
ofstream output(argv[2]); | |
while (!frequences.empty()) { | |
using Pair = decltype(frequences)::value_type; | |
auto iterator = max_element(begin(frequences), end(frequences), | |
[](const Pair& lhs, const Pair& rhs) { | |
if (lhs.second == rhs.second) { | |
return lhs.first >= rhs.first; | |
} | |
return lhs.second < rhs.second; | |
}); | |
output << iterator->second << ' ' << iterator->first << '\n'; | |
frequences.erase(iterator); | |
} | |
return EXIT_SUCCESS; | |
} | |
tuple<Symbol, Code> | |
read_symbol_as_lowercase(istream& input) { | |
const auto first = static_cast<uint8_t>(input.get()); | |
if (!input) { | |
return make_tuple(End, first); | |
} | |
if (first < 0x80) { | |
const char symbol = first; | |
if (isalpha(symbol)) { | |
return make_tuple(Letter, tolower(symbol)); | |
} | |
return make_tuple(Delimiter, symbol); | |
} | |
if (!input) { | |
input.unget(); | |
return make_tuple(Invalid, first); | |
} | |
if ((first & 0xe0) != 0xc0) { | |
input.unget(); | |
return make_tuple(Unsupported, first); | |
} | |
const auto second = static_cast<uint8_t>(input.peek()); | |
const auto code = static_cast<Code>(((first & 0x1f) << 6) | (second & 0x3f)); | |
if (0x0410 <= code && code <= 0x042f) { | |
input.get(); | |
return make_tuple(Letter, code + 32); | |
} | |
else if ((0x0430 <= code && code <= 0x044f) || code == 0x0451) { | |
input.get(); | |
return make_tuple(Letter, code); | |
} | |
else if (code == 0x0401) { | |
input.get(); | |
return make_tuple(Letter, 0x0451); | |
} | |
else { | |
input.unget(); | |
return make_tuple(Unsupported, code); | |
} | |
} | |
void | |
append_utf8(string& to, Code code) { | |
if (code < 0x80) { | |
to += static_cast<char>(code); | |
} | |
else if (0x80 <= code && code <= 0x07ff) { | |
to += static_cast<char>((code >> 6) | 0xc0); | |
to += static_cast<char>((code & 0x3f) | 0x80); | |
} | |
else { | |
throw runtime_error("only two-byte UTF8 characters supported"); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment