PlushBeaver · March 3, 2017 19:56
diff --git a/kozlyuk_utf8.cc b/kozlyuk_utf8.cc
 #include <iostream>
 #include <fstream>
 #include <unordered_map>
 #include <algorithm>

 using namespace std;

 using Code = uint16_t;

 enum Symbol { Letter, Delimiter, Unsupported, Invalid, End };

 void append_utf8(string& to, Code code) ;
 tuple<Symbol, Code> read_symbol_as_lowercase(istream& input) ;

 int
 main(int argc, char* argv[]) {
    if (argc != 3) {
        cerr << "Usage: " << argv[0] << " input.txt output.txt\n";
        return EXIT_FAILURE;
    }

    unordered_map<string, size_t> frequences;

    ifstream input(argv[1]);
    enum { Scanning, Skipping } state = Skipping;
    string word;
    while (input) {
        const auto maybe_letter = read_symbol_as_lowercase(input);
        const Symbol kind = get<0>(maybe_letter);
        if (kind == Unsupported) {
            throw runtime_error("unsupported symbol");
        }
        if (kind == Invalid) {
            throw runtime_error("invalid symbol");
        }
        switch (state) {
        case Skipping:
            if (kind == Letter) {
                word.clear();
                append_utf8(word, get<1>(maybe_letter));
                state = Scanning;
            }
            break;
        case Scanning:
            if (kind == Letter) {
                append_utf8(word, get<1>(maybe_letter));
            }
            else if (kind == Delimiter) {
                frequences[word]++;
                state = Skipping;
            }
            break;
        }
    }

    ofstream output(argv[2]);
    while (!frequences.empty()) {
        using Pair = decltype(frequences)::value_type;
        auto iterator = max_element(begin(frequences), end(frequences),
                [](const Pair& lhs, const Pair& rhs) {
                    if (lhs.second == rhs.second) {
                        return lhs.first >= rhs.first;
                    }
                    return lhs.second < rhs.second;
                });
        output << iterator->second << ' ' << iterator->first << '\n';
        frequences.erase(iterator);
    }

    return EXIT_SUCCESS;
 }

 tuple<Symbol, Code>
 read_symbol_as_lowercase(istream& input) {
    const auto first = static_cast<uint8_t>(input.get());
    if (!input) {
        return make_tuple(End, first);
    }

    if (first < 0x80) {
        const char symbol = first;
        if (isalpha(symbol)) {
            return make_tuple(Letter, tolower(symbol));
        }
        return make_tuple(Delimiter, symbol);
    }

    if (!input) {
        input.unget();
        return make_tuple(Invalid, first);
    }

    if ((first & 0xe0) != 0xc0) {
        input.unget();
        return make_tuple(Unsupported, first);
    }

    const auto second = static_cast<uint8_t>(input.peek());
    const auto code = static_cast<Code>(((first & 0x1f) << 6) | (second & 0x3f));
    if (0x0410 <= code && code <= 0x042f) {
        input.get();
        return make_tuple(Letter, code + 32);
    }
    else if ((0x0430 <= code && code <= 0x044f) || code == 0x0451) {
        input.get();
        return make_tuple(Letter, code);
    }
    else if (code == 0x0401) {
        input.get();
        return make_tuple(Letter, 0x0451);
    }
    else {
        input.unget();
        return make_tuple(Unsupported, code);
    }
 }

 void
 append_utf8(string& to, Code code) {
    if (code < 0x80) {
        to += static_cast<char>(code);
    }
    else if (0x80 <= code && code <= 0x07ff) {
        to += static_cast<char>((code >> 6) | 0xc0);
        to += static_cast<char>((code & 0x3f) | 0x80);
    }
    else {
        throw runtime_error("only two-byte UTF8 characters supported");
    }
 }
	#include <iostream>
	#include <fstream>
	#include <unordered_map>
	#include <algorithm>

	using namespace std;

	using Code = uint16_t;

	enum Symbol { Letter, Delimiter, Unsupported, Invalid, End };

	void append_utf8(string& to, Code code) ;
	tuple<Symbol, Code> read_symbol_as_lowercase(istream& input) ;

	int
	main(int argc, char* argv[]) {
	if (argc != 3) {
	cerr << "Usage: " << argv[0] << " input.txt output.txt\n";
	return EXIT_FAILURE;
	}

	unordered_map<string, size_t> frequences;

	ifstream input(argv[1]);
	enum { Scanning, Skipping } state = Skipping;
	string word;
	while (input) {
	const auto maybe_letter = read_symbol_as_lowercase(input);
	const Symbol kind = get<0>(maybe_letter);
	if (kind == Unsupported) {
	throw runtime_error("unsupported symbol");
	}
	if (kind == Invalid) {
	throw runtime_error("invalid symbol");
	}
	switch (state) {
	case Skipping:
	if (kind == Letter) {
	word.clear();
	append_utf8(word, get<1>(maybe_letter));
	state = Scanning;
	}
	break;
	case Scanning:
	if (kind == Letter) {
	append_utf8(word, get<1>(maybe_letter));
	}
	else if (kind == Delimiter) {
	frequences[word]++;
	state = Skipping;
	}
	break;
	}
	}

	ofstream output(argv[2]);
	while (!frequences.empty()) {
	using Pair = decltype(frequences)::value_type;
	auto iterator = max_element(begin(frequences), end(frequences),
	[](const Pair& lhs, const Pair& rhs) {
	if (lhs.second == rhs.second) {
	return lhs.first >= rhs.first;
	}
	return lhs.second < rhs.second;
	});
	output << iterator->second << ' ' << iterator->first << '\n';
	frequences.erase(iterator);
	}

	return EXIT_SUCCESS;
	}

	tuple<Symbol, Code>
	read_symbol_as_lowercase(istream& input) {
	const auto first = static_cast<uint8_t>(input.get());
	if (!input) {
	return make_tuple(End, first);
	}

	if (first < 0x80) {
	const char symbol = first;
	if (isalpha(symbol)) {
	return make_tuple(Letter, tolower(symbol));
	}
	return make_tuple(Delimiter, symbol);
	}

	if (!input) {
	input.unget();
	return make_tuple(Invalid, first);
	}

	if ((first & 0xe0) != 0xc0) {
	input.unget();
	return make_tuple(Unsupported, first);
	}

	const auto second = static_cast<uint8_t>(input.peek());
	const auto code = static_cast<Code>(((first & 0x1f) << 6) \| (second & 0x3f));
	if (0x0410 <= code && code <= 0x042f) {
	input.get();
	return make_tuple(Letter, code + 32);
	}
	else if ((0x0430 <= code && code <= 0x044f) \|\| code == 0x0451) {
	input.get();
	return make_tuple(Letter, code);
	}
	else if (code == 0x0401) {
	input.get();
	return make_tuple(Letter, 0x0451);
	}
	else {
	input.unget();
	return make_tuple(Unsupported, code);
	}
	}

	void
	append_utf8(string& to, Code code) {
	if (code < 0x80) {
	to += static_cast<char>(code);
	}
	else if (0x80 <= code && code <= 0x07ff) {
	to += static_cast<char>((code >> 6) \| 0xc0);
	to += static_cast<char>((code & 0x3f) \| 0x80);
	}
	else {
	throw runtime_error("only two-byte UTF8 characters supported");
	}
	}