Skip to content

Instantly share code, notes, and snippets.

@gourytch
Created February 22, 2017 18:58
Show Gist options
  • Save gourytch/12f32234824a360a4e8171152d78bd9f to your computer and use it in GitHub Desktop.
Save gourytch/12f32234824a360a4e8171152d78bd9f to your computer and use it in GitHub Desktop.
test work for UNIGINE class
//
// Lazy quick-and-dirty solution for the UNIGINE class,
// written as an exercise in procrastination.
// About 4-5 hours of a holiday were spent on it.
// Tested with g++.
// usage:
// freqs infile1 [...] outfile
// # build a dictionary
// # from the words in all input files
// # and write the result to the output file
//
// ./freqs
// # build a dictionary
// # from the internal test string
// # and print it to stdout
//--------------------------------------------------------------------
#include <ctype.h>
#include <fstream>
#include <iostream>
#include <istream>
#include <map>
#include <ostream>
#include <set>
#include <sstream>
#include <string>
// Shortest word length (in characters) that is still counted.
constexpr size_t MIN_WORD_SIZE{1};
// Built-in sample sentence, used as input when the program self-tests.
constexpr char text[] =
    "The time has come, the walrus said, to talk of many things.";
/*
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, "
"sed do eiusmod tempor incididunt ut labore et dolore "
"magna aliqua. Ut enim ad minim veniam, quis nostrud "
"exercitation ullamco laboris nisi ut aliquip ex ea commodo "
"consequat. Duis aute irure dolor in reprehenderit in "
"voluptate velit esse cillum dolore eu fugiat nulla "
"pariatur. Excepteur sint occaecat cupidatat non proident, "
"sunt in culpa qui officia deserunt mollit anim id est "
"laborum.";
*/
// Maps each distinct word to the number of times it has been counted.
using Dict = std::map<std::string, int>;
// One entry of the frequency table: a word together with its count.
// A named struct states the purpose more clearly than
// std::pair<int, std::string> would.
struct OrderPair {
    int count;         // count associated with `word`
    std::string word;  // the word itself
};

// Ordering used for the sorted table: entries with a larger count come
// first (descending by count); entries with equal counts are ordered by
// word (ascending).
bool operator<(const OrderPair& a, const OrderPair& b) {
    if (a.count != b.count) {
        return a.count > b.count;  // deliberately reversed: biggest first
    }
    return a.word < b.word;
}
// Word+count entries held in an ordered set, so they are kept sorted
// by operator< on OrderPair without any explicit sorting step.
using Freq = std::set<OrderPair>;
// Extract the next word from the input stream.
//
// Every character read is passed through ::tolower(); a word is a maximal
// run of characters for which ::isalpha() is then true. Non-alphabetic
// characters in front of the word are skipped, and the first non-alphabetic
// character after the word is consumed as well.
//
// Returns the lower-cased word, or an empty string when no further word can
// be read: at end of input, or when the stream is (or becomes) unusable
// because of an error. A stream that is not good() on entry is left
// untouched.
std::string next_word(std::istream& is) {
    const int kEof = std::istream::traits_type::eof();

    // eofbit, failbit or badbit already set: nothing more can be read.
    if (!is.good()) return std::string();

    // Skip forward to the first letter. get() returns eof() at end of input
    // AND on read errors, so testing the returned value (rather than
    // is.eof()) guarantees termination even when only badbit/failbit is set.
    int ch = ::tolower(is.get());
    while (ch != kEof && !::isalpha(ch)) {
        ch = ::tolower(is.get());
    }
    if (ch == kEof) return std::string();

    // Accumulate letters until the first non-letter or the end of input.
    std::string word;
    do {
        word.push_back(static_cast<char>(ch));
        ch = ::tolower(is.get());
    } while (ch != kEof && ::isalpha(ch));
    return word;
}
// Read every word from the input stream and add it to the tally.
//
// dict - word -> count table, updated in place. Existing counts are kept
//        and incremented, so several streams can be accumulated into the
//        same table.
// is   - stream to read from; words are taken with next_word() until it
//        returns an empty string.
//
// Words shorter than MIN_WORD_SIZE are not counted.
void collect(Dict& dict, std::istream& is) {
    for (std::string word = next_word(is); !word.empty(); word = next_word(is)) {
        if (word.size() < MIN_WORD_SIZE) continue;  // skip if too small
        // operator[] value-initializes a missing count to 0, so a single
        // lookup handles both a first occurrence and a repeated word.
        ++dict[word];
    }
}
// Build the ordered table: insert one {count, word} entry into `freq`
// for every word stored in `dict`. The set's ordering takes care of
// putting the entries into their final order.
void make_freq(Freq& freq, Dict& dict) {
    for (const auto& entry : dict) {
        freq.insert({entry.second, entry.first});
    }
}
// print dictionary
void printout(std::ostream& os, Freq& freq) {
for (Freq::iterator i = freq.begin(); i != freq.end(); ++i) {
os << (*i).count << " " << (*i).word << std::endl;
}
}
// perform test without external file
void test() {
std::istringstream is(text);
Dict dict;
Freq freq;
collect(dict, is);
make_freq(freq, dict);
printout(std::cout, freq);
}
// Program entry point.
//
//   freqs                        run the built-in self-test
//   freqs infile1 [...] outfile  count the words of all input files into
//                                one dictionary and write it to outfile
//
// Returns 0 on success and 1 when the arguments are unusable or the output
// file cannot be opened or written. An input file that cannot be opened is
// reported on stderr and skipped; the remaining files are still processed.
int main(int argc, char** argv) {
    if (argc <= 1) {  // some sort of self-test. argc can be zero on MVS
        test();
        return 0;
    }
    if (argc == 2) {
        // With a single argument there is no input file, yet that argument
        // would be opened for writing and truncated. Refuse rather than
        // silently destroy what is most likely the intended input.
        std::cerr << "usage: " << argv[0]
                  << " infile1 [...] outfile" << std::endl;
        return 1;
    }

    Dict dict;
    // we collect data from all input files into one dictionary
    for (int i = 1; i < argc - 1; ++i) {
        std::ifstream is(argv[i]);
        if (!is.is_open()) {
            std::cerr << "file open error: "
                      << argv[i] << std::endl;
            continue;
        }
        collect(dict, is);
    }

    Freq freq;
    make_freq(freq, dict);

    // The output file is opened only after all inputs have been read.
    const char* const out_name = argv[argc - 1];
    std::ofstream os(out_name);
    if (!os.is_open()) {
        std::cerr << "file open error: " << out_name << std::endl;
        return 1;
    }
    printout(os, freq);
    os.flush();
    if (!os) {  // report a failed write instead of exiting with success
        std::cerr << "file write error: " << out_name << std::endl;
        return 1;
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment