Last active
December 20, 2015 07:29
-
-
Save PhDP/6093357 to your computer and use it in GitHub Desktop.
The measure as defined in the Porter algorithm.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Computes the measure of an English word as defined for the Porter algorithm.
 * The definition of the measure can be found here:
 *   http://snowball.tartarus.org/algorithms/porter/stemmer.html
 *
 * ...but it's overly complicated. Here's my definition:
 *
 *   The *measure* of a word is the number of vowels that are immediately
 *   followed by a consonant. A 'y' counts as a vowel only when it comes
 *   right after a consonant.
 *
 * Examples:
 *   Tree = 0    Orc = 1    Obama = 2    Treason = 2
 *   CCVV        VCC        VCVCV        CCVVCVC
 *
 * Usage
 * -----
 * Supply a single argument (an English word) to get its measure. Run with
 * no arguments for the tests.
 *
 * Compilation
 * -----------
 *   clang++ -O3 -std=c++11 measure.cc -o measure
 */
#include <iostream> | |
#include <string> | |
#include <map> | |
#include <boost/algorithm/string.hpp> | |
using namespace std; | |
/**
 * Tells whether `c` is one of the five unconditional vowels.
 *
 * Only the lowercase letters 'a', 'e', 'i', 'o' and 'u' match; uppercase
 * letters and 'y' are reported as non-vowels.
 *
 * @param c  Character to classify.
 * @return   true if `c` is 'a', 'e', 'i', 'o' or 'u'; false otherwise.
 */
bool main_vowel(char c) {
  switch (c) {
    case 'a':
    case 'e':
    case 'i':
    case 'o':
    case 'u':
      return true;
    default:
      return false;
  }
}
bool vowel(const string &s, int idx) { | |
return main_vowel(s[idx]) || (idx > 0 && s[idx] == 'y' && !vowel(s, idx-1)); | |
} | |
bool consonant(const string &s, int idx) { | |
return !vowel(s, idx); | |
} | |
unsigned int measure(string s) { | |
boost::algorithm::to_lower(s); | |
unsigned int m = 0; | |
bool prev = vowel(s, 0); | |
for (int i = 1; i < s.length(); ++i) { | |
const bool curr = vowel(s, i); | |
if (prev && !curr) { | |
++m; | |
} | |
prev = curr; | |
} | |
return m; | |
} | |
/**
 * Program entry point.
 *
 * With no command-line argument, runs the built-in self-test: each word in
 * the table is measured and printed, and the program stops with exit code 42
 * at the first mismatch. With at least one argument, prints the measure of
 * the first argument; any further arguments are ignored.
 *
 * @param argc  Number of command-line arguments, including the program name.
 * @param argv  Command-line arguments; argv[1] is the word to measure.
 * @return      0 on success, 42 if a self-test case fails.
 */
int main(int argc, char **argv) {
  if (argc == 1) {
    std::cout << "Running tests...\n";
    // Word -> expected measure. std::map iterates in key order, which fixes
    // the order of the report lines.
    const std::map<std::string, unsigned int> words
      {{"Tr", 0}, {"ee", 0}, {"TREE", 0}, {"y", 0}, {"BY", 0},
       {"trouble", 1}, {"oats", 1}, {"trees", 1}, {"ivy", 1},
       {"Troubles", 2}, {"private", 2}, {"OATEN", 2}, {"orrery", 2}};
    for (const auto &x : words) {
      const unsigned int m = measure(x.first);
      std::cout << x.first << ": " << m;
      if (m != x.second) {
        std::cout << " -> WRONG!! Correct answer: " << x.second << '\n';
        return 42;
      }
      std::cout << '\n';
    }
    std::cout << "All good!" << '\n';
  } else {
    const std::string word{argv[1]};
    std::cout << measure(word) << '\n';
  }
  return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment