Created
March 19, 2013 00:14
-
-
Save tbl3rd/5192339 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
tbl.local # ls | |
stl.cc | |
tbl.local # cat stl.cc | |
// C++ has strong support for internationalization and localization, and | |
// it is better developed than the C equivalents. Here's a small sample | |
// program illustrating some techniques. This program is pure ANSI/ISO | |
// standard C++. It should compile and run on any conforming hosted C++ | |
// implementation on any OS. If you want it to run on a platform that | |
// uses wide characters for its native data representation, just change
// 'std::string' to 'std::wstring' and 'std::cout' to 'std::wcout' in
// main(). It parses words and frobs
// case according to locale. In C++ it is fairly simple to have multiple | |
// locales active in a process too -- so-called "non-global locales". | |
// | |
// Notice that this program's whole reason for being is to manipulate | |
// characters and strings, but it doesn't mention any char or string | |
// types except in main(). | |
// | |
// I wanted to parameterize on character -- rather than string -- type, | |
// but bugs in the trait classes in the MSVC++ standard library prevent | |
// that. So instead, I parameterize string type and use its value_type | |
// as the character type. Only main() knows the character/string type. | |
// | |
// This program is basically the old Unix scripting example | |
// | |
// cat "$@" | | |
// tr -cs 'A-Za-z' '\012' | | |
// sed '/^$/d' | | |
// tr 'A-Z' 'a-z' | | |
// sort | | |
// uniq -c | | |
// sort -n -r | |
// | |
// written in C++ and abstracted over character types | |
// and locales. It counts word frequency in a list of | |
// files, and prints a table sorted from most frequent | |
// to least. | |
// #pragma warning(disable: 4786) | |
#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <iostream>
#include <iterator>
#include <locale>
#include <map>
#include <sstream>
#include <string>
#include <vector>
namespace { | |
// Invert the mapping held in m: for each (key, value) entry of m, add a
// (value, key) entry to the multimap u.  The entries of m are visited
// from last to first, and each insertion is hinted with the position of
// the entry that was inserted just before it.
//
template<class Multi, class Map>
inline void invertMapReverse(Multi &u, const Map &m)
{
    typedef typename Map::const_reverse_iterator SourceIter;
    typedef typename Multi::value_type Inverted;
    typename Multi::iterator where = u.begin();
    SourceIter cursor = m.rbegin();
    while (cursor != m.rend()) {
        where = u.insert(where, Inverted(cursor->second, cursor->first));
        ++cursor;
    }
}
// Function object that writes a pair to a stream as "first second\n".
// It keeps a reference to the stream handed to its constructor, so the
// stream must outlive the ShowPair.
//
template<class Pair, class Stream> class ShowPair {
public:
    ShowPair(Stream &s): itsOut(s) {}
    void operator()(const Pair &p) {
        itsOut << p.first
               << ' '
               << p.second
               << '\n';
    }
private:
    Stream &itsOut;
};
// Return the locale used for character classification and case
// conversion in this file: a default-constructed std::locale, created
// on the first call and returned by reference on every call.
//
inline const std::locale &theLocale()
{
    static const std::locale shared = std::locale();
    return shared;
}
// Collect into wtc any word remaining in s after stripping punctuation. | |
// | |
template<class WordsToCounts> struct CollectOneWord { | |
typedef typename WordsToCounts::key_type StringT; | |
typedef typename StringT::value_type CharT; | |
typedef typename StringT::const_iterator ConstIterator; | |
private: | |
WordsToCounts &itsWtc; | |
static bool isAlNum(CharT c) { return std::isalnum(c, theLocale()); } | |
static bool isNotAlNum(CharT c) { return !isAlNum(c); } | |
public: | |
void operator()(const StringT &s) { | |
ConstIterator b = std::find_if(s.begin(), s.end(), isAlNum); | |
ConstIterator e = std::find_if(b, s.end(), isNotAlNum); | |
const StringT w(b, e); | |
if (!w.empty()) ++itsWtc[w]; | |
} | |
CollectOneWord(WordsToCounts &wtc): itsWtc(wtc) {} | |
}; | |
// Collect a word from s after downcasing it. | |
// | |
template<class WordsToCounts> class CollectFromLowercaseString { | |
typedef typename WordsToCounts::key_type StringT; | |
typedef typename StringT::value_type CharT; | |
CollectOneWord<WordsToCounts> itsCow; | |
static CharT toLowercase(CharT c) { return std::tolower(c, theLocale()); } | |
public: | |
void operator()(const StringT &s) { | |
std::basic_ostringstream<CharT> os; | |
std::ostream_iterator<CharT> osi(os); | |
std::transform(s.begin(), s.end(), osi, toLowercase); | |
itsCow(os.str()); | |
} | |
CollectFromLowercaseString(WordsToCounts &wtc): itsCow(wtc) {} | |
}; | |
// Collect all words from the file at pathname into wtc. | |
// | |
template<class WordsToCounts> class CollectAllWordsFromFile { | |
typedef typename WordsToCounts::key_type StringT; | |
typedef typename StringT::value_type CharT; | |
CollectFromLowercaseString<WordsToCounts> itsCollector; | |
public: | |
void operator()(const char *pathname) { | |
std::basic_ifstream<CharT> inputFile(pathname); | |
if (inputFile) { | |
std::istream_iterator<StringT> begin(inputFile), end; | |
std::for_each(begin, end, itsCollector); | |
} else { | |
throw pathname; | |
} | |
} | |
CollectAllWordsFromFile(WordsToCounts &wtc): itsCollector(wtc) {} | |
}; | |
// View the (ac, av) pair given to main() as a range of operands.
//
// begin() and end() delimit av[1] up to but not including av[ac];
// name() is av[0].  validate() requires at least one operand and
// prints the usage message on std::cerr when there is none.
//
class CommandLine {
public:
    CommandLine(int ac, char **av): itsCount(ac), itsArgs(av) {}
    const char *name() const { return *itsArgs; }
    char **begin() const { return &itsArgs[1]; }
    char **end() const { return &itsArgs[itsCount]; }
    void showUsage() const {
        std::cerr << "Usage: " << name() << " [<file> ...]" << std::endl;
    }
    bool validate() const {
        if (itsCount > 1) return true;
        showUsage();
        return false;
    }
private:
    int itsCount;
    char **itsArgs;
};
// Count the frequency of words occuring in files named on cl. | |
// | |
template<class String, class Stream> | |
inline void countWordFrequency(const CommandLine &cl, Stream &outS) | |
{ | |
typedef std::map<String, int, std::less<String> > WordsToCounts; | |
typedef std::multimap<int, String, std::less<int> > CountsToWords; | |
WordsToCounts wordsIn; | |
CollectAllWordsFromFile<WordsToCounts> caw(wordsIn); | |
std::for_each(cl.begin(), cl.end(), caw); | |
CountsToWords wordsOut; | |
invertMapReverse(wordsOut, wordsIn); | |
ShowPair<typename CountsToWords::value_type, Stream> showPair(outS); | |
std::for_each(wordsOut.rbegin(), wordsOut.rend(), showPair); | |
} | |
} | |
int main(int ac, char *av[]) | |
{ | |
CommandLine cl(ac, av); | |
bool ok = cl.validate(); | |
try { | |
if (ok) countWordFrequency<std::string>(cl, std::cout); | |
} catch (const char *pathname) { | |
std::cerr << cl.name() << ": cannot open file named \"" | |
<< pathname << "\"." << std::endl; | |
cl.showUsage(); | |
ok = false; | |
} | |
return ok? EXIT_SUCCESS: EXIT_FAILURE; | |
} | |
// The CommandLine class is a minimal STL-style container -- implemented | |
// only enough to use conveniently with for_each(). | |
// | |
// invertMapReverse() is a new "generic algorithm" that inverts the | |
// mapping relation in any "associative container". It runs in linear
// time when every insertion hint is accurate, and in O(n log n) time
// otherwise.
tbl.local # make stl | |
g++ stl.cc -o stl | |
tbl.local # ./stl stl.cc | head | |
26 std | |
17 the | |
15 a | |
13 it | |
13 in | |
13 const | |
11 typename | |
11 and | |
10 typedef | |
10 stringt | |
tbl.local # |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment