Last active
February 27, 2017 07:14
-
-
Save Starl1ght/0c1effd58dc580d5bc389e9b0f13fb32 to your computer and use it in GitHub Desktop.
unigine test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <set>
#include <vector>
// PREFACE
// I have never written text parsers like this before, so it came out rather ugly.. ;(
// Known to work on MSVC2015 (in theory it should work on MSVC2013 too, but that is untested).
// Parses Russian UTF-8 text and tries to compare words lexicographically.
// There should be no difference between Linux and Windows, since UTF-8 is the same everywhere.
// States of the byte-level scanner driven by fsm().
enum class FS {
    // The next byte starts a new character: an ASCII byte or a 0xD0/0xD1
    // UTF-8 lead byte (handled by ProcessFirstChar).
    FirstChar,
    // The previous byte was a 0xD0/0xD1 lead byte that is still sitting at
    // buf.back(); the next byte is its continuation (handled by
    // ProcessSecondChar).
    SecondChar,
};
// Current scanner state; updated by fsm() after every input byte.
FS state{ FS::FirstChar };

// Bytes of the word currently being accumulated (letters are stored
// already lower-cased by the Process* handlers).
std::vector<uint8_t> buf{};

// Completed words mapped to their occurrence count. Each key is stored
// with a trailing 0 byte so that it can be treated as a C string.
std::map<std::vector<uint8_t>, uint32_t> wordMap{};
// Lower-cases the continuation byte of an upper-case Cyrillic letter
// U+0410..U+042F, which UTF-8 encodes as 0xD0 followed by 0x90..0xAF.
//
// Precondition: buf is non-empty and buf.back() is the 0xD0 lead byte of the
// letter being converted; ch is in [0x90, 0xAF].
//
// ch - continuation byte of the upper-case letter.
// Returns the continuation byte of the matching lower-case letter.
// Side effect: for U+0420..U+042F the lower-case form is encoded under the
// 0xD1 lead byte, so buf.back() is incremented from 0xD0 to 0xD1.
uint8_t ruLowerCase(uint8_t ch) {
    if (ch <= 0x9F) {
        // U+0410..U+041F (D0 90..9F) -> U+0430..U+043F (D0 B0..BF):
        // same lead byte, continuation byte moves up by 0x20.
        return static_cast<uint8_t>(ch + 0x20);
    }
    // U+0420..U+042F (D0 A0..AF) -> U+0440..U+044F (D1 80..8F):
    // lead byte becomes 0xD1 and the continuation byte moves down by 0x20.
    // (Subtracting 0x1F instead would shift every letter by one code point.)
    ++buf.back();
    return static_cast<uint8_t>(ch - 0x20);
}
// Handles a byte that begins a new character.
//
// ch - the input byte.
// Returns the state for the next byte: SecondChar after a 0xD0/0xD1 lead
// byte, FirstChar otherwise.
//
// ASCII letters are appended to buf in lower case, a 0xD0/0xD1 lead byte is
// appended as-is, and any other byte ends the current word: a non-empty buf
// gets a trailing 0, is counted in wordMap and is then cleared.
FS ProcessFirstChar(uint8_t ch) {
    const bool upperLatin = (ch >= 0x41 && ch <= 0x5A);    // 'A'..'Z'
    const bool lowerLatin = (ch >= 0x61 && ch <= 0x7A);    // 'a'..'z'
    const bool cyrillicLead = (ch == 0xD0 || ch == 0xD1);  // 2-byte lead

    if (upperLatin || lowerLatin) {
        // Upper-case ASCII is folded to lower case by adding 0x20.
        buf.push_back(upperLatin ? static_cast<uint8_t>(ch + 0x20) : ch);
        return FS::FirstChar;
    }

    if (cyrillicLead) {
        buf.push_back(ch);
        return FS::SecondChar;
    }

    // Anything else is a word separator.
    if (buf.empty()) {
        return FS::FirstChar;
    }
    buf.push_back(0);
    ++wordMap[buf];
    buf.clear();
    return FS::FirstChar;
}
// Handles the byte that follows a 0xD0/0xD1 lead byte.
//
// Precondition: buf is non-empty and buf.back() is the lead byte pushed by
// ProcessFirstChar.
//
// ch - the input byte.
// Returns the state for the next byte.
//
// Accepted sequences (all stored lower-cased):
//   D0 90..AF  upper-case letters, converted through ruLowerCase()
//   D0 B0..BF  lower-case letters
//   D1 80..8F  lower-case letters
// Any other byte means the lead byte did not start a supported letter: the
// lead byte is discarded and the word collected so far (if any) is counted.
// Note that D0 81 and D1 91 fall outside the accepted ranges and therefore
// act as word breaks.
FS ProcessSecondChar(uint8_t ch) {
    const uint8_t lead = buf.back();

    if (lead == 0xD0 && ch >= 0x90 && ch <= 0xAF) {
        // ruLowerCase may rewrite the lead byte in buf, so call it before
        // appending the converted continuation byte.
        const uint8_t lowered = ruLowerCase(ch);
        buf.push_back(lowered);
        return FS::FirstChar;
    }

    const bool lowerUnderD0 = (lead == 0xD0 && ch >= 0xB0 && ch <= 0xBF);
    const bool lowerUnderD1 = (lead == 0xD1 && ch >= 0x80 && ch <= 0x8F);
    if (lowerUnderD0 || lowerUnderD1) {
        buf.push_back(ch);
        return FS::FirstChar;
    }

    // Unsupported sequence: drop the dangling lead byte and close the word.
    buf.pop_back();
    if (!buf.empty()) {
        buf.push_back(0);
        ++wordMap[buf];
        buf.clear();
    }

    // The current byte has not been consumed as a continuation, so it must be
    // examined as the start of a new character. Otherwise an ASCII letter or
    // a fresh lead byte that follows a stray lead byte would be silently lost.
    return ProcessFirstChar(ch);
}
void fsm(uint8_t ch) { | |
switch (state) { | |
case (FS::FirstChar): | |
state = ProcessFirstChar(ch); | |
break; | |
case (FS::SecondChar): | |
state = ProcessSecondChar(ch); | |
break; | |
} | |
} | |
// Returns the size of the stream in bytes and rewinds it to the beginning.
//
// ifs - input stream to measure; its read position is moved to offset 0.
// Returns the end offset reported by tellg(), or 0 when the position cannot
// be determined (e.g. the stream is not open or not seekable). Returning 0
// instead of converting the -1 failure value to uint64_t keeps callers that
// loop "size" times from iterating practically forever.
uint64_t GetSize(std::ifstream& ifs) {
    ifs.seekg(0, std::ios::end);
    const std::streamoff endOffset = ifs.tellg();
    ifs.seekg(0, std::ios::beg);
    if (endOffset < 0) {
        return 0;
    }
    return static_cast<uint64_t>(endOffset);
}
int main(int argc, char** argv) { | |
if (argc != 3) { | |
std::cout << "Specify I\O files.\n"; | |
return -3; | |
} | |
std::ifstream ifs{ argv[1], std::ios::binary }; | |
if (!ifs.is_open()) { | |
std::cout << "Input '" << argv[1] << "' not opened. Exiting.\n"; | |
return -1; | |
} | |
const auto fsize = GetSize(ifs); | |
uint8_t ch; | |
for (uint64_t i = 0; i < fsize; ++i) { | |
ifs.read((char*)&ch, 1); | |
fsm(ch); | |
} | |
if (!buf.empty()) { | |
buf.push_back(0); | |
++wordMap[buf]; | |
} | |
using wordPair = std::pair<std::vector<uint8_t>, uint32_t>; | |
struct wpcompare { | |
bool operator()(const wordPair& pr1, const wordPair& pr2) const { | |
if (pr1.second > pr2.second) { | |
return true; | |
} | |
if (pr1.second < pr2.second) { | |
return false; | |
} | |
if (strcmp((const char*)pr1.first.data(), (const char*)pr2.first.data()) < 0) { | |
return true; | |
} | |
return false; | |
} | |
}; | |
std::set<wordPair, wpcompare> pairset; | |
for (const auto& pair : wordMap) { | |
pairset.emplace(pair.first, pair.second); | |
} | |
std::ofstream ofs{ argv[2], std::ios::trunc }; | |
if (!ofs.is_open()) { | |
std::cout << "Output '" << argv[2] << "' not opened. Exiting.\n"; | |
return -2; | |
} | |
for (const auto& pr : pairset) { | |
ofs << pr.first.data() << " " << pr.second << '\n'; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment