Last active
December 20, 2015 07:59
-
-
Save PhDP/6097444 to your computer and use it in GitHub Desktop.
Rough draft of a stemmer and english stemmer classes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
///////////////////////// | |
// stemmer.hh // | |
///////////////////////// | |
#pragma once | |
#include <iostream> | |
#include <string> | |
/** | |
* Abstract class for stemmers. | |
*/ | |
class stemmer { | |
public: | |
/** | |
* Return the original string. | |
*/ | |
virtual std::string get_original() const = 0; | |
/** | |
* Return the word after stemming. | |
*/ | |
virtual std::string get_stem() const = 0; | |
/** | |
* True if the strings after and before stemming are the same. | |
*/ | |
virtual bool same() const = 0; | |
}; | |
///////////////////////// | |
// stemmer_en.hh // | |
///////////////////////// | |
#pragma once | |
#include <iostream> | |
#include <string> | |
#include <list> | |
#include <boost/algorithm/string.hpp> | |
#include "stemmer.hh" | |
/** | |
* Stemmer for English words based on the Porter algorithm. | |
*/ | |
class stemmer_en : public stemmer { | |
private: | |
const std::string m_original; // The original string. | |
std::string m_stem; // The stemmed string. | |
static std::list<std::pair<std::string, std::string>> m_step1b; | |
static std::list<std::pair<std::string, std::string>> m_step2; | |
static std::list<std::pair<std::string, std::string>> m_step3; | |
static std::list<std::pair<std::string, std::string>> m_step4; | |
public: | |
/** | |
* The stemmer is a lightweight object that stores the original | |
* word and its stem. The Porter algorithm to find the stem is | |
* called directly in the constructor. | |
*/ | |
stemmer_en(std::string word); | |
~stemmer_en(); | |
std::string get_original() const; | |
std::string get_stem() const; | |
bool same() const; | |
/** To read the stem. */ | |
char operator[](unsigned int idx) const; | |
/** Returns true for the main vowels. */ | |
static bool main_vowel(char c); | |
/** Returns true if the letter at the given index is a vowel. */ | |
static bool vowel(const std::string &s, int idx); | |
/** Returns true if the letter at the given index is a consonant. */ | |
static bool consonant(const std::string &s, int idx); | |
/** Returns true if the word has at least one vowel. */ | |
static bool has_vowel(const std::string &s); | |
/** | |
* Returns true if the word ends with double consonants. | |
*/ | |
static bool double_consonants(const std::string &s); | |
/** | |
* Implementation of the 'measure' as defined in the Porter algorithm. | |
* | |
* The formal definition (overtly complicated) can be found on the official | |
* website, but I define it as: the number of times a vowel is followed by | |
* a consonant, e.g.: | |
* Tree = 0 Orc = 1 Obama = 2 Treason = 2 Minotaur = 3 | |
* CCVV VCC VCVCV CCVVCVC CVCVCVVC | |
*/ | |
static unsigned int measure(std::string s); | |
/** Tests if 'w' ends with the substring 'end'. */ | |
static bool ends_with(const std::string &w, const std::string &end); | |
private: | |
/** | |
* Takes a list of pairs (I don't use a map to keep the ordering) plus a | |
* minimum measure value for the stem. Returns true if mapping occurred. | |
*/ | |
bool replace_list(const std::list<std::pair<std::string, std::string>> &mappings, int m); | |
/** | |
* The weird o-rule: | |
* "The stem ends cvc, where the second c is not W, X or Y (e.g. -WIL, -HOP)." | |
*/ | |
bool o_rule(const std::string &stem) const; | |
// Steps 1 to 5: | |
void step1a(); | |
void step1b(); | |
void step1b_pt2(); | |
void step1c(); | |
void step1(); | |
void step2(); | |
void step3(); | |
void step4(); | |
void step5a(); | |
void step5b(); | |
void step5(); | |
void stemming(); | |
}; | |
std::list<std::pair<std::string, std::string>> stemmer_en::m_step1b | |
{{"at", "ate"}, | |
{"bl", "ble"}, | |
{"iz", "ize"}}; | |
std::list<std::pair<std::string, std::string>> stemmer_en::m_step2 | |
{{"ational", "ate"}, | |
{"tional", "tion"}, | |
{"enci", "ence"}, | |
{"anci", "ance"}, | |
{"izer", "ize"}, | |
{"bli", "ble"}, | |
{"alli", "al"}, | |
{"entli", "ent"}, | |
{"eli", "e"}, | |
{"ousli", "ous"}, | |
{"ization", "ize"}, | |
{"ation", "ate"}, | |
{"ator", "ate"}, | |
{"alism", "al"}, | |
{"iveness", "ive"}, | |
{"fulness", "ful"}, | |
{"ousness", "ous"}, | |
{"aliti", "al"}, | |
{"iviti", "ive"}, | |
{"biliti", "ble"}, | |
{"logi", "log"}}; | |
std::list<std::pair<std::string, std::string>> stemmer_en::m_step3 | |
{{"icate", "ic"}, | |
{"ative", ""}, | |
{"alize", "al"}, | |
{"iciti", "ic"}, | |
{"ical", "ic"}, | |
{"ful", ""}, | |
{"ness", ""}}; | |
std::list<std::pair<std::string, std::string>> stemmer_en::m_step4 | |
{{"al", ""}, | |
{"ance", ""}, | |
{"ence", ""}, | |
{"er", ""}, | |
{"ic", ""}, | |
{"able", ""}, | |
{"ible", ""}, | |
{"ant", ""}, | |
{"ement", ""}, | |
{"ment", ""}, | |
{"ent", ""}, | |
{"ou", ""}, | |
{"ism", ""}, | |
{"ate", ""}, | |
{"iti", ""}, | |
{"ous", ""}, | |
{"ive", ""}, | |
{"ize", ""}}; | |
stemmer_en::stemmer_en(std::string word) : m_original(word), m_stem(word) { | |
boost::algorithm::to_lower(m_stem); | |
if (word.length() > 2) { | |
stemming(); | |
} | |
} | |
stemmer_en::~stemmer_en() { | |
// ... | |
} | |
std::string stemmer_en::get_original() const { | |
return m_original; | |
} | |
std::string stemmer_en::get_stem() const { | |
return m_stem; | |
} | |
bool stemmer_en::same() const { | |
return m_original == m_stem; | |
} | |
char stemmer_en::operator[](unsigned int idx) const { | |
return m_stem[idx]; | |
} | |
bool stemmer_en::main_vowel(char c) { | |
return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u'; | |
} | |
bool stemmer_en::vowel(const std::string &s, int idx) { | |
return main_vowel(s[idx]) || (idx > 0 && s[idx] == 'y' && !vowel(s, idx-1)); | |
} | |
bool stemmer_en::consonant(const std::string &s, int idx) { | |
return !vowel(s, idx); | |
} | |
bool stemmer_en::has_vowel(const std::string &s) { | |
for (int i = 0; i < s.length(); ++i) { | |
if (vowel(s, i)) { | |
return true; | |
} | |
} | |
return false; | |
} | |
bool stemmer_en::double_consonants(const std::string &s) { | |
const unsigned int x = s.length() - 1; | |
return consonant(s, x) && (s[x] == s[x-1]); | |
} | |
unsigned int stemmer_en::measure(std::string s) { | |
boost::algorithm::to_lower(s); | |
unsigned int m = 0; | |
bool prev = vowel(s, 0); | |
for (int i = 1; i < s.length(); ++i) { | |
const bool curr = vowel(s, i); | |
if (prev && !curr) { | |
++m; | |
} | |
prev = curr; | |
} | |
return m; | |
} | |
bool stemmer_en::ends_with(const std::string &w, const std::string &end) { | |
const size_t n0 = w.length(); | |
const size_t n1 = end.length(); | |
if (n0 < n1) { | |
return false; | |
} | |
for (int i = 0; i < n1; ++i) { | |
if (w[i + n0 - n1] != end[i]) { | |
return false; | |
} | |
} | |
return true; | |
} | |
bool stemmer_en::replace_list(const std::list<std::pair<std::string, std::string>> &mappings, int m) { | |
const size_t n0 = m_stem.length(); | |
for (const auto &w : mappings) { | |
const size_t n1 = w.first.length(); | |
if (ends_with(m_stem, w.first) && measure(m_stem.substr(0, n0 - n1)) > m) { | |
const size_t nr = w.second.length(); | |
const int delta = n1 - nr; | |
if (delta >= 0) { // n1 >= nr | |
m_stem.replace(n0 - n1, nr, w.second); | |
m_stem.erase(n0 - delta, delta); | |
} else { // n1 < nr | |
m_stem = m_stem.substr(0, n0 - n1) + w.second; | |
} | |
return true; | |
} | |
} | |
return false; | |
} | |
bool stemmer_en::o_rule(const std::string &stem) const { | |
const size_t n = stem.length(); | |
if (n >= 3 && vowel(stem, n-2) && consonant(stem, n-3)) { | |
const char c = stem[n-1]; | |
return consonant(stem, n-1) && c != 'w' && c != 'x' && c != 'y'; | |
} | |
return false; | |
} | |
void stemmer_en::step1a() { | |
const size_t n = m_stem.length(); | |
if (m_stem[n-1] == 's') { | |
if (m_stem[n-2] != 's') { | |
m_stem.erase(n - 1, 1); | |
} else if (m_stem[n-2] == 'e') { | |
if (m_stem[n-3] == 'i' || (m_stem[n-3] == 's' && m_stem[n-4] == 's')) { | |
m_stem.erase(n - 2, 2); | |
} else { | |
m_stem.erase(n - 1, 1); | |
} | |
} | |
} | |
} | |
void stemmer_en::step1b() { | |
const size_t n = m_stem.length(); | |
if (n > 3 && m_stem[n-1] == 'd' && m_stem[n-2] == 'e' && m_stem[n-3] == 'e') { | |
if (measure(m_stem.substr(0, n-3)) > 0) { | |
m_stem.erase(n - 1, 1); | |
} | |
} else if (m_stem[n-1] == 'd' && m_stem[n-2] == 'e') { | |
if (has_vowel(m_stem.substr(0, n-2))) { | |
m_stem.erase(n - 2, 2); | |
step1b_pt2(); | |
} | |
} else if (m_stem[n-1] == 'g' && m_stem[n-2] == 'n' && m_stem[n-3] == 'i') { | |
if (has_vowel(m_stem.substr(0, n-3))) { | |
m_stem.erase(n - 3, 3); | |
step1b_pt2(); | |
} | |
} | |
} | |
void stemmer_en::step1b_pt2() { | |
const bool replaced = replace_list(m_step1b, -1); | |
if (!replaced) { | |
const size_t n = m_stem.length(); | |
if (double_consonants(m_stem) && m_stem[n-1] != 'l' && m_stem[n-1] != 's' && m_stem[n-1] != 'z') { | |
m_stem.erase(n - 1, 1); | |
} else if (measure(m_stem) == 1 && o_rule(m_stem)) { | |
m_stem += 'e'; | |
} | |
} | |
} | |
void stemmer_en::step1c() { | |
const size_t n = m_stem.length(); | |
if (m_stem[n-1] == 'y' && has_vowel(m_stem.substr(0, n - 2))) { | |
m_stem[n-1] = 'i'; | |
} | |
} | |
void stemmer_en::step1() { | |
step1a(); | |
step1b(); | |
step1c(); | |
} | |
void stemmer_en::step2() { | |
replace_list(m_step2, 0); | |
} | |
void stemmer_en::step3() { | |
replace_list(m_step3, 0); | |
} | |
void stemmer_en::step4() { | |
const bool replaced = replace_list(m_step4, 1); | |
if (!replaced && ends_with(m_stem, "ion")) { | |
std::string s = m_stem.substr(0, m_stem.length() - 3); | |
const char c = s[s.length() -1]; | |
if (measure(s) > 1 && (c == 's' || c == 't')) { | |
m_stem.erase(m_stem.length() - 3, 3); | |
} | |
} | |
} | |
void stemmer_en::step5a() { | |
const size_t n = m_stem.length(); | |
if (m_stem[n-1] == 'e') { | |
const std::string stem = m_stem.substr(0, n-1); | |
const unsigned int m = measure(stem); | |
if (m > 1 || ((m == 1) && !o_rule(stem))) { | |
m_stem.erase(n - 1, 1); | |
} | |
} | |
} | |
void stemmer_en::step5b() { | |
const size_t n = m_stem.length(); | |
if (measure(m_stem) > 1 && m_stem[n-1] == 'l' && m_stem[n-2] == 'l') { | |
m_stem.erase(n - 1, 1); | |
} | |
} | |
void stemmer_en::step5() { | |
step5a(); | |
step5b(); | |
} | |
void stemmer_en::stemming() { | |
step1(); | |
step2(); | |
step3(); | |
step4(); | |
step5(); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment