gourytch · February 22, 2017 18:58
diff --git a/freqs.cpp b/freqs.cpp
 //
 // lazy quick-and-dirty solution for UNIGINE class
 // made as an object for procrastination.
 // about 4-5 hours of a holiday were spent on it
 // tested on g++
 // usage:
 // freqs infile1 [...] outfile
 //    # build dictionary
 //    # from words in all input files
 //    # and write result to output file
 //
 // ./freqs
 //    # build dictionary
 //    # from internal test string
 //    # and print to stdout
 //--------------------------------------------------------------------

 #include <ctype.h>
 #include <fstream>
 #include <iostream>
 #include <istream>
 #include <map>
 #include <ostream>
 #include <set>
 #include <sstream>


 // Shortest word length (in chars) that still gets counted;
 // collect() skips anything shorter.
 const size_t MIN_WORD_SIZE = 1;


 // Built-in sample text used for the quick self-test.
 const char text[] = "The time has come, the walrus said, to talk of many things.";
 /* Alternative, longer sample text:
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit, "
        "sed do eiusmod tempor incididunt ut labore et dolore "
        "magna aliqua. Ut enim ad minim veniam, quis nostrud "
        "exercitation ullamco laboris nisi ut aliquip ex ea commodo "
        "consequat. Duis aute irure dolor in reprehenderit in "
        "voluptate velit esse cillum dolore eu fugiat nulla "
        "pariatur. Excepteur sint occaecat cupidatat non proident, "
        "sunt in culpa qui officia deserunt mollit anim id est "
        "laborum.";
 */


 // Maps each word to the number of times it was seen.
 using Dict = std::map<std::string, int>;


 // One row of the frequency table: a word and its occurrence count.
 // Named fields state the purpose more clearly than
 // std::pair<int, std::string> would.
 struct OrderPair {
    int count;
    std::string word;
 };


 // Ordering used by the sorted set below:
 // larger counts come first; equal counts are ordered by ascending word.
 bool operator<(const OrderPair& a, const OrderPair& b) {
    if (a.count != b.count) {
        return b.count < a.count; // descending by count
    }
    return a.word < b.word;
 }


 // Collection of word+count rows that keeps itself sorted via operator<.
 using Freq = std::set<OrderPair>;


 // Return the next word from the input stream, converted to lower case.
 //
 // A word is a maximal run of characters for which ::isalpha is true
 // (checked after ::tolower); everything else is a separator. The separator
 // that ends the word is consumed as well.
 //
 // Returns an empty string when no further word can be read, i.e. on end of
 // input or when the stream is in a failed state.
 std::string next_word(std::istream& is) {
    std::string word;
    int ch;
    // First, skip everything that precedes the start of a word.
    for (;;) {
        ch = is.get();
        // Check the whole stream state, not only eof(): a read error sets
        // failbit/badbit without eofbit, get() then keeps returning EOF and
        // an eof()-only check would loop forever.
        if (!is) return word; // still empty here
        ch = ::tolower(ch);
        if (::isalpha(ch)) break;
    }
    // Then accumulate letters until a separator or the end of input.
    do {
        word.push_back(static_cast<char>(ch));
        ch = is.get();
        if (!is) break;
        ch = ::tolower(ch);
    } while (::isalpha(ch));
    return word;
 }


 // Pull words out of the input stream one by one and tally them in dict.
 // Stops as soon as next_word() yields an empty string; words shorter than
 // MIN_WORD_SIZE are ignored.
 void collect(Dict& dict, std::istream& is) {
    while (true) {
        const std::string token = next_word(is);
        if (token.empty()) {
            return; // nothing left to read
        }
        if (token.size() >= MIN_WORD_SIZE) {
            // operator[] creates a missing entry with count 0 before the
            // increment, so a first occurrence ends up as 1.
            ++dict[token];
        }
    }
 }


 // Fill the ordered set from the collected dictionary: every word/count
 // entry of dict becomes one OrderPair in freq, and the set keeps the
 // rows in the desired order.
 void make_freq(Freq& freq, Dict& dict) {
    for (const auto& entry : dict) {
        OrderPair row;
        row.count = entry.second;
        row.word = entry.first;
        freq.insert(row);
    }
 }


 // print dictionary
 void printout(std::ostream& os, Freq& freq) {
    for (Freq::iterator i = freq.begin(); i != freq.end(); ++i) {
        os << (*i).count << " " << (*i).word << std::endl;
    }
 }


 // perform test without external file
 void test() {
    std::istringstream is(text);
    Dict dict;
    Freq freq;
    collect(dict, is);
    make_freq(freq, dict);
    printout(std::cout, freq);
 }


 // Entry point.
 //   no arguments            -> run the built-in self-test, print to stdout
 //   infile1 [...] outfile   -> count words of all input files together and
 //                              write the table to outfile
 // Returns 0 on success, 1 on a usage error or when the output file cannot
 // be opened or written. Input files that cannot be opened are reported on
 // stderr and skipped.
 int main(int argc, char ** argv) {
    if (argc <= 1) { // some sort of self-test. argc can be zero on MVS
        test();
        return 0;
    }
    if (argc == 2) {
        // A single argument would be used as the output file and truncated
        // although no input was read at all; refuse instead of clobbering it.
        std::cerr << "usage: " << argv[0]
                  << " infile1 [...] outfile" << std::endl;
        return 1;
    }

    // collect data from all input files into one dictionary
    Dict dict;
    for (int i = 1; i < argc - 1; ++i) {
        std::ifstream is(argv[i]);
        if (!is.is_open()) {
            std::cerr << "file open error: "
                      << argv[i] << std::endl;
            continue; // best effort: carry on with the remaining inputs
        }
        collect(dict, is);
    }

    Freq freq;
    make_freq(freq, dict);

    std::ofstream os(argv[argc - 1]);
    if (!os.is_open()) {
        std::cerr << "file open error: "
                  << argv[argc - 1] << std::endl;
        return 1;
    }
    printout(os, freq);
    os.flush();
    if (!os) {
        std::cerr << "file write error: "
                  << argv[argc - 1] << std::endl;
        return 1;
    }
    return 0;
 }
	//
	// lazy quick-and-dirty solution for UNIGINE class
	// made as an object for procrastination.
	// about 4-5 hours of a holiday were spent on it
	// tested on g++
	// usage:
	// freqs infile1 [...] outfile
	// # build dictionary
	// # from words in all input files
	// # and write result to output file
	//
	// ./freqs
	// # build dictionary
	// # from internal test string
	// # and print to stdout
	//--------------------------------------------------------------------

	#include <ctype.h>
	#include <fstream>
	#include <iostream>
	#include <istream>
	#include <map>
	#include <ostream>
	#include <set>
	#include <sstream>


	// Shortest word length (in chars) that still gets counted;
	// collect() skips anything shorter.
	const size_t MIN_WORD_SIZE = 1;


	// Built-in sample text used for the quick self-test.
	const char text[] = "The time has come, the walrus said, to talk of many things.";
	/* Alternative, longer sample text:
	    "Lorem ipsum dolor sit amet, consectetur adipiscing elit, "
	    "sed do eiusmod tempor incididunt ut labore et dolore "
	    "magna aliqua. Ut enim ad minim veniam, quis nostrud "
	    "exercitation ullamco laboris nisi ut aliquip ex ea commodo "
	    "consequat. Duis aute irure dolor in reprehenderit in "
	    "voluptate velit esse cillum dolore eu fugiat nulla "
	    "pariatur. Excepteur sint occaecat cupidatat non proident, "
	    "sunt in culpa qui officia deserunt mollit anim id est "
	    "laborum.";
	*/


	// Maps each word to the number of times it was seen.
	using Dict = std::map<std::string, int>;


	// One row of the frequency table: a word and its occurrence count.
	// Named fields state the purpose more clearly than
	// std::pair<int, std::string> would.
	struct OrderPair {
	    int count;
	    std::string word;
	};


	// Ordering used by the sorted set below:
	// larger counts come first; equal counts are ordered by ascending word.
	bool operator<(const OrderPair& a, const OrderPair& b) {
	    if (a.count != b.count) {
	        return b.count < a.count; // descending by count
	    }
	    return a.word < b.word;
	}


	// Collection of word+count rows that keeps itself sorted via operator<.
	using Freq = std::set<OrderPair>;


	// Return the next word from the input stream, converted to lower case.
	//
	// A word is a maximal run of characters for which ::isalpha is true
	// (checked after ::tolower); everything else is a separator. The separator
	// that ends the word is consumed as well.
	//
	// Returns an empty string when no further word can be read, i.e. on end of
	// input or when the stream is in a failed state.
	std::string next_word(std::istream& is) {
	    std::string word;
	    int ch;
	    // First, skip everything that precedes the start of a word.
	    for (;;) {
	        ch = is.get();
	        // Check the whole stream state, not only eof(): a read error sets
	        // failbit/badbit without eofbit, get() then keeps returning EOF and
	        // an eof()-only check would loop forever.
	        if (!is) return word; // still empty here
	        ch = ::tolower(ch);
	        if (::isalpha(ch)) break;
	    }
	    // Then accumulate letters until a separator or the end of input.
	    do {
	        word.push_back(static_cast<char>(ch));
	        ch = is.get();
	        if (!is) break;
	        ch = ::tolower(ch);
	    } while (::isalpha(ch));
	    return word;
	}


	// Pull words out of the input stream one by one and tally them in dict.
	// Stops as soon as next_word() yields an empty string; words shorter than
	// MIN_WORD_SIZE are ignored.
	void collect(Dict& dict, std::istream& is) {
	    while (true) {
	        const std::string token = next_word(is);
	        if (token.empty()) {
	            return; // nothing left to read
	        }
	        if (token.size() >= MIN_WORD_SIZE) {
	            // operator[] creates a missing entry with count 0 before the
	            // increment, so a first occurrence ends up as 1.
	            ++dict[token];
	        }
	    }
	}


	// Fill the ordered set from the collected dictionary: every word/count
	// entry of dict becomes one OrderPair in freq, and the set keeps the
	// rows in the desired order.
	void make_freq(Freq& freq, Dict& dict) {
	    // Each element is a (word, count) pair; the previous text accessed
	    // .second/.first on the iterator itself (the dereference was lost),
	    // which does not compile.
	    for (const auto& entry : dict) {
	        OrderPair row;
	        row.count = entry.second;
	        row.word = entry.first;
	        freq.insert(row);
	    }
	}


	// print dictionary
	void printout(std::ostream& os, Freq& freq) {
	for (Freq::iterator i = freq.begin(); i != freq.end(); ++i) {
	os << (i).count << " " << (i).word << std::endl;
	}
	}


	// perform test without external file
	void test() {
	std::istringstream is(text);
	Dict dict;
	Freq freq;
	collect(dict, is);
	make_freq(freq, dict);
	printout(std::cout, freq);
	}


	// Entry point.
	//   no arguments            -> run the built-in self-test, print to stdout
	//   infile1 [...] outfile   -> count words of all input files together and
	//                              write the table to outfile
	// Returns 0 on success, 1 on a usage error or when the output file cannot
	// be opened or written. Input files that cannot be opened are reported on
	// stderr and skipped.
	int main(int argc, char ** argv) {
	    if (argc <= 1) { // some sort of self-test. argc can be zero on MVS
	        test();
	        return 0;
	    }
	    if (argc == 2) {
	        // A single argument would be used as the output file and truncated
	        // although no input was read at all; refuse instead of clobbering it.
	        std::cerr << "usage: " << argv[0]
	                  << " infile1 [...] outfile" << std::endl;
	        return 1;
	    }

	    // collect data from all input files into one dictionary
	    Dict dict;
	    for (int i = 1; i < argc - 1; ++i) {
	        std::ifstream is(argv[i]);
	        if (!is.is_open()) {
	            std::cerr << "file open error: "
	                      << argv[i] << std::endl;
	            continue; // best effort: carry on with the remaining inputs
	        }
	        collect(dict, is);
	    }

	    Freq freq;
	    make_freq(freq, dict);

	    std::ofstream os(argv[argc - 1]);
	    if (!os.is_open()) {
	        std::cerr << "file open error: "
	                  << argv[argc - 1] << std::endl;
	        return 1;
	    }
	    printout(os, freq);
	    os.flush();
	    if (!os) {
	        std::cerr << "file write error: "
	                  << argv[argc - 1] << std::endl;
	        return 1;
	    }
	    return 0;
	}