Created
February 22, 2017 18:58
-
-
Save gourytch/12f32234824a360a4e8171152d78bd9f to your computer and use it in GitHub Desktop.
test work for UNIGINE class
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//
// Lazy quick-and-dirty solution for the UNIGINE class,
// made as an object for procrastination.
// About 4..5 hours of a holiday were spent on it.
// Tested with g++.
//
// usage:
//   freqs infile1 [...] outfile
//       # build a dictionary
//       # from the words in all input files
//       # and write the result to the output file
//
//   ./freqs
//       # build a dictionary
//       # from the internal test string
//       # and print it to stdout
//--------------------------------------------------------------------
#include <ctype.h>

#include <cstddef>
#include <fstream>
#include <iostream>
#include <istream>
#include <map>
#include <ostream>
#include <set>
#include <sstream>
#include <string>
// Minimal length (in characters) a word must have to be counted;
// anything shorter is skipped while collecting.
const std::size_t MIN_WORD_SIZE = 1;
// Built-in sample text used by the self-test when no files are given.
const char text[] =
    "The time has come, the walrus said, to talk of many things.";
// Alternative, longer sample text (kept for manual experiments):
/*
    "Lorem ipsum dolor sit amet, consectetur adipiscing elit, "
    "sed do eiusmod tempor incididunt ut labore et dolore "
    "magna aliqua. Ut enim ad minim veniam, quis nostrud "
    "exercitation ullamco laboris nisi ut aliquip ex ea commodo "
    "consequat. Duis aute irure dolor in reprehenderit in "
    "voluptate velit esse cillum dolore eu fugiat nulla "
    "pariatur. Excepteur sint occaecat cupidatat non proident, "
    "sunt in culpa qui officia deserunt mollit anim id est "
    "laborum.";
*/
// Maps each word to the number of times it has been seen.
using Dict = std::map<std::string, int>;
// One entry of the frequency listing: a word together with its count.
// A dedicated struct states the purpose more clearly than
// std::pair<int, std::string>.
struct OrderPair {
    int count;         // number of occurrences of `word`
    std::string word;  // the word itself
};

// Ordering used by the sorted frequency set:
// larger counts come first (reverse numeric order), and entries with
// equal counts are ordered by word in ascending order.
bool operator<(const OrderPair& a, const OrderPair& b) {
    if (a.count != b.count) {
        return a.count > b.count;  // deliberately reversed: biggest first
    }
    return a.word < b.word;
}
// Set of word+count entries that keeps itself sorted via
// operator<(const OrderPair&, const OrderPair&).
using Freq = std::set<OrderPair>;
// Extract the next word from the input stream.
//
// A word is a maximal run of characters for which ::isalpha() is true;
// every other character is treated as a separator and skipped. The word
// is returned in lower case. The single separator that terminates the
// word is consumed from the stream as well.
// (The original author notes that support for other alphabets may be
// added later.)
//
// Returns an empty string when no further word can be read, i.e. when
// the end of the stream is reached or the stream is in an error state.
std::string next_word(std::istream& is) {
    const int eof = std::istream::traits_type::eof();

    // Skip separators. get() yields eof() both at end of input and when
    // the stream has failed, so a broken stream cannot loop forever here.
    int ch = is.get();
    while (ch != eof && !::isalpha(ch)) {
        ch = is.get();
    }

    // Accumulate letters until a separator or the end of input.
    std::string word;
    while (ch != eof && ::isalpha(ch)) {
        word += static_cast<char>(::tolower(ch));
        ch = is.get();
    }
    return word;
}
// Read all words from the input stream and add their occurrences to dict.
// Counts already present in dict are kept and increased, so the function
// can be called repeatedly to merge several inputs into one dictionary.
// Words shorter than MIN_WORD_SIZE are ignored.
void collect(Dict& dict, std::istream& is) {
    std::string word;
    // next_word() signals "nothing left" with an empty string.
    while (!(word = next_word(is)).empty()) {
        if (word.size() < MIN_WORD_SIZE) {
            continue;  // too short to be counted
        }
        // operator[] creates a zero-initialized counter for a new word,
        // so one lookup covers both the "new" and the "seen" case.
        ++dict[word];
    }
}
// Fill freq with one OrderPair per dictionary entry.
// The set orders the entries itself (see operator< for OrderPair), so
// after this call freq holds the words in the desired output order.
// Entries already present in freq are left in place.
void make_freq(Freq& freq, Dict& dict) {
    for (const auto& entry : dict) {
        // entry.first is the word, entry.second its count
        freq.insert({entry.second, entry.first});
    }
}
// print dictionary | |
void printout(std::ostream& os, Freq& freq) { | |
for (Freq::iterator i = freq.begin(); i != freq.end(); ++i) { | |
os << (*i).count << " " << (*i).word << std::endl; | |
} | |
} | |
// perform test without external file | |
void test() { | |
std::istringstream is(text); | |
Dict dict; | |
Freq freq; | |
collect(dict, is); | |
make_freq(freq, dict); | |
printout(std::cout, freq); | |
} | |
int main(int argc, char ** argv) { | |
if (argc <= 1) { // some sort of self-test. can be zero on MVS | |
test(); | |
} else { | |
Dict dict; | |
Freq freq; | |
// we will collect data from all input files | |
// to one dictionary | |
for (int i = 1; i < argc-1; i++) { | |
std::ifstream is(argv[i]); | |
if (is.is_open()) { | |
collect(dict, is); | |
} else { | |
std::cerr << "file open error: " | |
<< argv[i] << std::endl; | |
} | |
} | |
make_freq(freq, dict); | |
std::ofstream os(argv[argc-1]); | |
printout(os, freq); | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment