Skip to content

Instantly share code, notes, and snippets.

@PhDP
Last active December 20, 2015 07:59
Show Gist options
  • Save PhDP/6097444 to your computer and use it in GitHub Desktop.
Save PhDP/6097444 to your computer and use it in GitHub Desktop.
Rough draft of a stemmer and english stemmer classes
/////////////////////////
// stemmer.hh //
/////////////////////////
#pragma once
#include <iostream>
#include <string>
/**
* Abstract class for stemmers.
*/
class stemmer {
public:
/**
* Return the original string.
*/
virtual std::string get_original() const = 0;
/**
* Return the word after stemming.
*/
virtual std::string get_stem() const = 0;
/**
* True if the strings after and before stemming are the same.
*/
virtual bool same() const = 0;
};
/////////////////////////
// stemmer_en.hh //
/////////////////////////
#pragma once
#include <iostream>
#include <string>
#include <list>
#include <boost/algorithm/string.hpp>
#include "stemmer.hh"
/**
* Stemmer for English words based on the Porter algorithm.
*/
class stemmer_en : public stemmer {
private:
const std::string m_original; // The original string.
std::string m_stem; // The stemmed string.
static std::list<std::pair<std::string, std::string>> m_step1b;
static std::list<std::pair<std::string, std::string>> m_step2;
static std::list<std::pair<std::string, std::string>> m_step3;
static std::list<std::pair<std::string, std::string>> m_step4;
public:
/**
* The stemmer is a lightweight object that stores the original
* word and its stem. The Porter algorithm to find the stem is
* called directly in the constructor.
*/
stemmer_en(std::string word);
~stemmer_en();
std::string get_original() const;
std::string get_stem() const;
bool same() const;
/** To read the stem. */
char operator[](unsigned int idx) const;
/** Returns true for the main vowels. */
static bool main_vowel(char c);
/** Returns true if the letter at the given index is a vowel. */
static bool vowel(const std::string &s, int idx);
/** Returns true if the letter at the given index is a consonant. */
static bool consonant(const std::string &s, int idx);
/** Returns true if the word has at least one vowel. */
static bool has_vowel(const std::string &s);
/**
* Returns true if the word ends with double consonants.
*/
static bool double_consonants(const std::string &s);
/**
* Implementation of the 'measure' as defined in the Porter algorithm.
*
* The formal definition (overtly complicated) can be found on the official
* website, but I define it as: the number of times a vowel is followed by
* a consonant, e.g.:
* Tree = 0 Orc = 1 Obama = 2 Treason = 2 Minotaur = 3
* CCVV VCC VCVCV CCVVCVC CVCVCVVC
*/
static unsigned int measure(std::string s);
/** Tests if 'w' ends with the substring 'end'. */
static bool ends_with(const std::string &w, const std::string &end);
private:
/**
* Takes a list of pairs (I don't use a map to keep the ordering) plus a
* minimum measure value for the stem. Returns true if mapping occurred.
*/
bool replace_list(const std::list<std::pair<std::string, std::string>> &mappings, int m);
/**
* The weird o-rule:
* "The stem ends cvc, where the second c is not W, X or Y (e.g. -WIL, -HOP)."
*/
bool o_rule(const std::string &stem) const;
// Steps 1 to 5:
void step1a();
void step1b();
void step1b_pt2();
void step1c();
void step1();
void step2();
void step3();
void step4();
void step5a();
void step5b();
void step5();
void stemming();
};
std::list<std::pair<std::string, std::string>> stemmer_en::m_step1b
{{"at", "ate"},
{"bl", "ble"},
{"iz", "ize"}};
std::list<std::pair<std::string, std::string>> stemmer_en::m_step2
{{"ational", "ate"},
{"tional", "tion"},
{"enci", "ence"},
{"anci", "ance"},
{"izer", "ize"},
{"bli", "ble"},
{"alli", "al"},
{"entli", "ent"},
{"eli", "e"},
{"ousli", "ous"},
{"ization", "ize"},
{"ation", "ate"},
{"ator", "ate"},
{"alism", "al"},
{"iveness", "ive"},
{"fulness", "ful"},
{"ousness", "ous"},
{"aliti", "al"},
{"iviti", "ive"},
{"biliti", "ble"},
{"logi", "log"}};
std::list<std::pair<std::string, std::string>> stemmer_en::m_step3
{{"icate", "ic"},
{"ative", ""},
{"alize", "al"},
{"iciti", "ic"},
{"ical", "ic"},
{"ful", ""},
{"ness", ""}};
std::list<std::pair<std::string, std::string>> stemmer_en::m_step4
{{"al", ""},
{"ance", ""},
{"ence", ""},
{"er", ""},
{"ic", ""},
{"able", ""},
{"ible", ""},
{"ant", ""},
{"ement", ""},
{"ment", ""},
{"ent", ""},
{"ou", ""},
{"ism", ""},
{"ate", ""},
{"iti", ""},
{"ous", ""},
{"ive", ""},
{"ize", ""}};
stemmer_en::stemmer_en(std::string word) : m_original(word), m_stem(word) {
boost::algorithm::to_lower(m_stem);
if (word.length() > 2) {
stemming();
}
}
stemmer_en::~stemmer_en() {
// ...
}
std::string stemmer_en::get_original() const {
return m_original;
}
std::string stemmer_en::get_stem() const {
return m_stem;
}
bool stemmer_en::same() const {
return m_original == m_stem;
}
char stemmer_en::operator[](unsigned int idx) const {
return m_stem[idx];
}
bool stemmer_en::main_vowel(char c) {
return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u';
}
bool stemmer_en::vowel(const std::string &s, int idx) {
return main_vowel(s[idx]) || (idx > 0 && s[idx] == 'y' && !vowel(s, idx-1));
}
bool stemmer_en::consonant(const std::string &s, int idx) {
return !vowel(s, idx);
}
bool stemmer_en::has_vowel(const std::string &s) {
for (int i = 0; i < s.length(); ++i) {
if (vowel(s, i)) {
return true;
}
}
return false;
}
bool stemmer_en::double_consonants(const std::string &s) {
const unsigned int x = s.length() - 1;
return consonant(s, x) && (s[x] == s[x-1]);
}
unsigned int stemmer_en::measure(std::string s) {
boost::algorithm::to_lower(s);
unsigned int m = 0;
bool prev = vowel(s, 0);
for (int i = 1; i < s.length(); ++i) {
const bool curr = vowel(s, i);
if (prev && !curr) {
++m;
}
prev = curr;
}
return m;
}
bool stemmer_en::ends_with(const std::string &w, const std::string &end) {
const size_t n0 = w.length();
const size_t n1 = end.length();
if (n0 < n1) {
return false;
}
for (int i = 0; i < n1; ++i) {
if (w[i + n0 - n1] != end[i]) {
return false;
}
}
return true;
}
bool stemmer_en::replace_list(const std::list<std::pair<std::string, std::string>> &mappings, int m) {
const size_t n0 = m_stem.length();
for (const auto &w : mappings) {
const size_t n1 = w.first.length();
if (ends_with(m_stem, w.first) && measure(m_stem.substr(0, n0 - n1)) > m) {
const size_t nr = w.second.length();
const int delta = n1 - nr;
if (delta >= 0) { // n1 >= nr
m_stem.replace(n0 - n1, nr, w.second);
m_stem.erase(n0 - delta, delta);
} else { // n1 < nr
m_stem = m_stem.substr(0, n0 - n1) + w.second;
}
return true;
}
}
return false;
}
bool stemmer_en::o_rule(const std::string &stem) const {
const size_t n = stem.length();
if (n >= 3 && vowel(stem, n-2) && consonant(stem, n-3)) {
const char c = stem[n-1];
return consonant(stem, n-1) && c != 'w' && c != 'x' && c != 'y';
}
return false;
}
void stemmer_en::step1a() {
const size_t n = m_stem.length();
if (m_stem[n-1] == 's') {
if (m_stem[n-2] != 's') {
m_stem.erase(n - 1, 1);
} else if (m_stem[n-2] == 'e') {
if (m_stem[n-3] == 'i' || (m_stem[n-3] == 's' && m_stem[n-4] == 's')) {
m_stem.erase(n - 2, 2);
} else {
m_stem.erase(n - 1, 1);
}
}
}
}
void stemmer_en::step1b() {
const size_t n = m_stem.length();
if (n > 3 && m_stem[n-1] == 'd' && m_stem[n-2] == 'e' && m_stem[n-3] == 'e') {
if (measure(m_stem.substr(0, n-3)) > 0) {
m_stem.erase(n - 1, 1);
}
} else if (m_stem[n-1] == 'd' && m_stem[n-2] == 'e') {
if (has_vowel(m_stem.substr(0, n-2))) {
m_stem.erase(n - 2, 2);
step1b_pt2();
}
} else if (m_stem[n-1] == 'g' && m_stem[n-2] == 'n' && m_stem[n-3] == 'i') {
if (has_vowel(m_stem.substr(0, n-3))) {
m_stem.erase(n - 3, 3);
step1b_pt2();
}
}
}
void stemmer_en::step1b_pt2() {
const bool replaced = replace_list(m_step1b, -1);
if (!replaced) {
const size_t n = m_stem.length();
if (double_consonants(m_stem) && m_stem[n-1] != 'l' && m_stem[n-1] != 's' && m_stem[n-1] != 'z') {
m_stem.erase(n - 1, 1);
} else if (measure(m_stem) == 1 && o_rule(m_stem)) {
m_stem += 'e';
}
}
}
void stemmer_en::step1c() {
const size_t n = m_stem.length();
if (m_stem[n-1] == 'y' && has_vowel(m_stem.substr(0, n - 2))) {
m_stem[n-1] = 'i';
}
}
void stemmer_en::step1() {
step1a();
step1b();
step1c();
}
void stemmer_en::step2() {
replace_list(m_step2, 0);
}
void stemmer_en::step3() {
replace_list(m_step3, 0);
}
void stemmer_en::step4() {
const bool replaced = replace_list(m_step4, 1);
if (!replaced && ends_with(m_stem, "ion")) {
std::string s = m_stem.substr(0, m_stem.length() - 3);
const char c = s[s.length() -1];
if (measure(s) > 1 && (c == 's' || c == 't')) {
m_stem.erase(m_stem.length() - 3, 3);
}
}
}
void stemmer_en::step5a() {
const size_t n = m_stem.length();
if (m_stem[n-1] == 'e') {
const std::string stem = m_stem.substr(0, n-1);
const unsigned int m = measure(stem);
if (m > 1 || ((m == 1) && !o_rule(stem))) {
m_stem.erase(n - 1, 1);
}
}
}
void stemmer_en::step5b() {
const size_t n = m_stem.length();
if (measure(m_stem) > 1 && m_stem[n-1] == 'l' && m_stem[n-2] == 'l') {
m_stem.erase(n - 1, 1);
}
}
void stemmer_en::step5() {
step5a();
step5b();
}
void stemmer_en::stemming() {
step1();
step2();
step3();
step4();
step5();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment