PhDP · December 20, 2015 07:59
diff --git a/stemmers.cc b/stemmers.cc
 /////////////////////////
 // stemmer.hh          //
 /////////////////////////

 #pragma once

 #include <iostream>
 #include <string>

 /**
 * Abstract class for stemmers.
 */
 class stemmer {
 public:
    /**
     * Return the original string.
     */
    virtual std::string get_original() const = 0;

    /**
     * Return the word after stemming.
     */
    virtual std::string get_stem() const = 0;

    /**
     * True if the strings after and before stemming are the same.
     */
    virtual bool same() const = 0;
 };

 /////////////////////////
 // stemmer_en.hh       //
 /////////////////////////

 #pragma once

 #include <iostream>
 #include <string>
 #include <list>
 #include <boost/algorithm/string.hpp>
 #include "stemmer.hh"

 /**
 * Stemmer for English words based on the Porter algorithm.
 */
 class stemmer_en : public stemmer {
 private:
    const std::string m_original; // The original string.
    std::string m_stem; // The stemmed string.

    static std::list<std::pair<std::string, std::string>> m_step1b;
    static std::list<std::pair<std::string, std::string>> m_step2;
    static std::list<std::pair<std::string, std::string>> m_step3;
    static std::list<std::pair<std::string, std::string>> m_step4;

 public:
    /**
     * The stemmer is a lightweight object that stores the original
     * word and its stem. The Porter algorithm to find the stem is
     * called directly in the constructor.
     */
    stemmer_en(std::string word);

    ~stemmer_en();

    std::string get_original() const;

    std::string get_stem() const;

    bool same() const;

    /** To read the stem. */
    char operator[](unsigned int idx) const;

    /** Returns true for the main vowels. */
    static bool main_vowel(char c);

    /** Returns true if the letter at the given index is a vowel. */
    static bool vowel(const std::string &s, int idx);

    /** Returns true if the letter at the given index is a consonant. */
    static bool consonant(const std::string &s, int idx);

    /** Returns true if the word has at least one vowel. */
    static bool has_vowel(const std::string &s);

    /**
     * Returns true if the word ends with double consonants.
     */
    static bool double_consonants(const std::string &s);

    /**
     * Implementation of the 'measure' as defined in the Porter algorithm.
     *
     * The formal definition (overtly complicated) can be found on the official
     * website, but I define it as: the number of times a vowel is followed by
     * a consonant, e.g.:
     *    Tree = 0     Orc = 1     Obama = 2     Treason = 2     Minotaur = 3
     *    CCVV         VCC         VCVCV         CCVVCVC         CVCVCVVC
     */
    static unsigned int measure(std::string s);

    /** Tests if 'w' ends with the substring 'end'. */
    static bool ends_with(const std::string &w, const std::string &end);
 private:
    /**
     * Takes a list of pairs (I don't use a map to keep the ordering) plus a
     * minimum measure value for the stem. Returns true if mapping occurred.
     */
    bool replace_list(const std::list<std::pair<std::string, std::string>> &mappings, int m);

    /**
     * The weird o-rule:
     *    "The stem ends cvc, where the second c is not W, X or Y (e.g. -WIL, -HOP)."
     */
    bool o_rule(const std::string &stem) const;

    // Steps 1 to 5:

    void step1a();
    void step1b();
    void step1b_pt2();
    void step1c();
    void step1();
    void step2();
    void step3();
    void step4();
    void step5a();
    void step5b();
    void step5();
    void stemming();
 };

 std::list<std::pair<std::string, std::string>> stemmer_en::m_step1b
    {{"at", "ate"},
     {"bl", "ble"},
     {"iz", "ize"}};

 std::list<std::pair<std::string, std::string>> stemmer_en::m_step2
    {{"ational", "ate"},
     {"tional", "tion"},
     {"enci", "ence"},
     {"anci", "ance"},
     {"izer", "ize"},
     {"bli", "ble"},
     {"alli", "al"},
     {"entli", "ent"},
     {"eli", "e"},
     {"ousli", "ous"},
     {"ization", "ize"},
     {"ation", "ate"},
     {"ator", "ate"},
     {"alism", "al"},
     {"iveness", "ive"},
     {"fulness", "ful"},
     {"ousness", "ous"},
     {"aliti", "al"},
     {"iviti", "ive"},
     {"biliti", "ble"},
     {"logi", "log"}};

 std::list<std::pair<std::string, std::string>> stemmer_en::m_step3
    {{"icate", "ic"},
     {"ative", ""},
     {"alize", "al"},
     {"iciti", "ic"},
     {"ical", "ic"},
     {"ful", ""},
     {"ness", ""}};

 std::list<std::pair<std::string, std::string>> stemmer_en::m_step4
    {{"al", ""},
     {"ance", ""},
     {"ence", ""},
     {"er", ""},
     {"ic", ""},
     {"able", ""},
     {"ible", ""},
     {"ant", ""},
     {"ement", ""},
     {"ment", ""},
     {"ent", ""},
     {"ou", ""},
     {"ism", ""},
     {"ate", ""},
     {"iti", ""},
     {"ous", ""},
     {"ive", ""},
     {"ize", ""}};

 stemmer_en::stemmer_en(std::string word) : m_original(word), m_stem(word) {
    boost::algorithm::to_lower(m_stem);
    if (word.length() > 2) {
        stemming();
    }
 }

 stemmer_en::~stemmer_en() {
    // ...
 }

 std::string stemmer_en::get_original() const {
    return m_original;
 }

 std::string stemmer_en::get_stem() const {
    return m_stem;
 }

 bool stemmer_en::same() const {
    return m_original == m_stem;
 }

 char stemmer_en::operator[](unsigned int idx) const {
    return m_stem[idx];
 }

 bool stemmer_en::main_vowel(char c) {
    return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u';
 }

 bool stemmer_en::vowel(const std::string &s, int idx) {
    return main_vowel(s[idx]) || (idx > 0 && s[idx] == 'y' && !vowel(s, idx-1));
 }

 bool stemmer_en::consonant(const std::string &s, int idx) {
    return !vowel(s, idx);
 }

 bool stemmer_en::has_vowel(const std::string &s) {
    for (int i = 0; i < s.length(); ++i) {
        if (vowel(s, i)) {
            return true;
        }
    }
    return false;
 }

 bool stemmer_en::double_consonants(const std::string &s) {
    const unsigned int x = s.length() - 1;
    return consonant(s, x) && (s[x] == s[x-1]);
 }

 unsigned int stemmer_en::measure(std::string s) {
    boost::algorithm::to_lower(s);
    unsigned int m = 0;
    bool prev = vowel(s, 0);
    for (int i = 1; i < s.length(); ++i) {
        const bool curr = vowel(s, i);
        if (prev && !curr) {
            ++m;
        }
        prev = curr;
    }
    return m;
 }

 bool stemmer_en::ends_with(const std::string &w, const std::string &end) {
    const size_t n0 = w.length();
    const size_t n1 = end.length();
    if (n0 < n1) {
        return false;
    }
    for (int i = 0; i < n1; ++i) {
        if (w[i + n0 - n1] != end[i]) {
            return false;
        }
    }
    return true;
 }

 bool stemmer_en::replace_list(const std::list<std::pair<std::string, std::string>> &mappings, int m) {
    const size_t n0 = m_stem.length();
    for (const auto &w : mappings) {
        const size_t n1 = w.first.length();
        if (ends_with(m_stem, w.first) && measure(m_stem.substr(0, n0 - n1)) > m) {
            const size_t nr = w.second.length();
            const int delta = n1 - nr;
            if (delta >= 0) { // n1 >= nr
                m_stem.replace(n0 - n1, nr, w.second);
                m_stem.erase(n0 - delta, delta);
            } else { // n1 < nr
                m_stem = m_stem.substr(0, n0 - n1) + w.second;
            }
            return true;
        }
    }
    return false;
 }

 bool stemmer_en::o_rule(const std::string &stem) const {
    const size_t n = stem.length();
    if (n >= 3 && vowel(stem, n-2) && consonant(stem, n-3)) {
        const char c = stem[n-1];
        return consonant(stem, n-1) && c != 'w' && c != 'x' && c != 'y';
    }
    return false;
 }

 void stemmer_en::step1a() {
    const size_t n = m_stem.length();
    if (m_stem[n-1] == 's') {
        if (m_stem[n-2] != 's') {
            m_stem.erase(n - 1, 1);
        } else if (m_stem[n-2] == 'e') {
            if (m_stem[n-3] == 'i' || (m_stem[n-3] == 's' && m_stem[n-4] == 's')) {
                m_stem.erase(n - 2, 2);
            } else {
                m_stem.erase(n - 1, 1);
            }
        }
    }
 }

 void stemmer_en::step1b() {
    const size_t n = m_stem.length();
    if (n > 3 && m_stem[n-1] == 'd' && m_stem[n-2] == 'e' && m_stem[n-3] == 'e') {
        if (measure(m_stem.substr(0, n-3)) > 0) {
            m_stem.erase(n - 1, 1);
        }
    } else if (m_stem[n-1] == 'd' && m_stem[n-2] == 'e') {
        if (has_vowel(m_stem.substr(0, n-2))) {
            m_stem.erase(n - 2, 2);
            step1b_pt2();
        }
    } else if (m_stem[n-1] == 'g' && m_stem[n-2] == 'n' && m_stem[n-3] == 'i') {
        if (has_vowel(m_stem.substr(0, n-3))) {
            m_stem.erase(n - 3, 3);
            step1b_pt2();
        }
    }
 }

 void stemmer_en::step1b_pt2() {
    const bool replaced = replace_list(m_step1b, -1);
    if (!replaced) {
        const size_t n = m_stem.length();
        if (double_consonants(m_stem) && m_stem[n-1] != 'l' && m_stem[n-1] != 's' && m_stem[n-1] != 'z') {
            m_stem.erase(n - 1, 1);
        } else if (measure(m_stem) == 1 && o_rule(m_stem)) {
            m_stem += 'e';
        }
    }
 }

 void stemmer_en::step1c() {
    const size_t n = m_stem.length();
    if (m_stem[n-1] == 'y' && has_vowel(m_stem.substr(0, n - 2))) {
        m_stem[n-1] = 'i';
    }
 }

 void stemmer_en::step1() {
    step1a();
    step1b();
    step1c();
 }

 void stemmer_en::step2() {
    replace_list(m_step2, 0);
 }

 void stemmer_en::step3() {
    replace_list(m_step3, 0);
 }

 void stemmer_en::step4() {
    const bool replaced = replace_list(m_step4, 1);
    if (!replaced && ends_with(m_stem, "ion")) {
        std::string s = m_stem.substr(0, m_stem.length() - 3);
        const char c = s[s.length() -1];
        if (measure(s) > 1 && (c == 's' || c == 't')) {
            m_stem.erase(m_stem.length() - 3, 3);
        }
    }
 }

 void stemmer_en::step5a() {
    const size_t n = m_stem.length();
    if (m_stem[n-1] == 'e') {
        const std::string stem = m_stem.substr(0, n-1);
        const unsigned int m = measure(stem);
        if (m > 1 || ((m == 1) && !o_rule(stem))) {
            m_stem.erase(n - 1, 1);
        }
    }
 }

 void stemmer_en::step5b() {
    const size_t n = m_stem.length();
    if (measure(m_stem) > 1 && m_stem[n-1] == 'l' && m_stem[n-2] == 'l') {
        m_stem.erase(n - 1, 1);
    }
 }

 void stemmer_en::step5() {
    step5a();
    step5b();
 }

 void stemmer_en::stemming() {
    step1();
    step2();
    step3();
    step4();
    step5();
 }
	/////////////////////////
	// stemmer.hh //
	/////////////////////////

	#pragma once

	#include <iostream>
	#include <string>

	/**
	* Abstract class for stemmers.
	*/
	class stemmer {
	public:
	/**
	* Return the original string.
	*/
	virtual std::string get_original() const = 0;

	/**
	* Return the word after stemming.
	*/
	virtual std::string get_stem() const = 0;

	/**
	* True if the strings after and before stemming are the same.
	*/
	virtual bool same() const = 0;
	};

	/////////////////////////
	// stemmer_en.hh //
	/////////////////////////

	#pragma once

	#include <iostream>
	#include <string>
	#include <list>
	#include <boost/algorithm/string.hpp>
	#include "stemmer.hh"

	/**
	* Stemmer for English words based on the Porter algorithm.
	*/
	class stemmer_en : public stemmer {
	private:
	const std::string m_original; // The original string.
	std::string m_stem; // The stemmed string.

	static std::list<std::pair<std::string, std::string>> m_step1b;
	static std::list<std::pair<std::string, std::string>> m_step2;
	static std::list<std::pair<std::string, std::string>> m_step3;
	static std::list<std::pair<std::string, std::string>> m_step4;

	public:
	/**
	* The stemmer is a lightweight object that stores the original
	* word and its stem. The Porter algorithm to find the stem is
	* called directly in the constructor.
	*/
	stemmer_en(std::string word);

	~stemmer_en();

	std::string get_original() const;

	std::string get_stem() const;

	bool same() const;

	/** To read the stem. */
	char operator[](unsigned int idx) const;

	/** Returns true for the main vowels. */
	static bool main_vowel(char c);

	/** Returns true if the letter at the given index is a vowel. */
	static bool vowel(const std::string &s, int idx);

	/** Returns true if the letter at the given index is a consonant. */
	static bool consonant(const std::string &s, int idx);

	/** Returns true if the word has at least one vowel. */
	static bool has_vowel(const std::string &s);

	/**
	* Returns true if the word ends with double consonants.
	*/
	static bool double_consonants(const std::string &s);

	/**
	* Implementation of the 'measure' as defined in the Porter algorithm.
	*
	* The formal definition (overtly complicated) can be found on the official
	* website, but I define it as: the number of times a vowel is followed by
	* a consonant, e.g.:
	* Tree = 0 Orc = 1 Obama = 2 Treason = 2 Minotaur = 3
	* CCVV VCC VCVCV CCVVCVC CVCVCVVC
	*/
	static unsigned int measure(std::string s);

	/** Tests if 'w' ends with the substring 'end'. */
	static bool ends_with(const std::string &w, const std::string &end);
	private:
	/**
	* Takes a list of pairs (I don't use a map to keep the ordering) plus a
	* minimum measure value for the stem. Returns true if mapping occurred.
	*/
	bool replace_list(const std::list<std::pair<std::string, std::string>> &mappings, int m);

	/**
	* The weird o-rule:
	* "The stem ends cvc, where the second c is not W, X or Y (e.g. -WIL, -HOP)."
	*/
	bool o_rule(const std::string &stem) const;

	// Steps 1 to 5:

	void step1a();
	void step1b();
	void step1b_pt2();
	void step1c();
	void step1();
	void step2();
	void step3();
	void step4();
	void step5a();
	void step5b();
	void step5();
	void stemming();
	};

	std::list<std::pair<std::string, std::string>> stemmer_en::m_step1b
	{{"at", "ate"},
	{"bl", "ble"},
	{"iz", "ize"}};

	std::list<std::pair<std::string, std::string>> stemmer_en::m_step2
	{{"ational", "ate"},
	{"tional", "tion"},
	{"enci", "ence"},
	{"anci", "ance"},
	{"izer", "ize"},
	{"bli", "ble"},
	{"alli", "al"},
	{"entli", "ent"},
	{"eli", "e"},
	{"ousli", "ous"},
	{"ization", "ize"},
	{"ation", "ate"},
	{"ator", "ate"},
	{"alism", "al"},
	{"iveness", "ive"},
	{"fulness", "ful"},
	{"ousness", "ous"},
	{"aliti", "al"},
	{"iviti", "ive"},
	{"biliti", "ble"},
	{"logi", "log"}};

	std::list<std::pair<std::string, std::string>> stemmer_en::m_step3
	{{"icate", "ic"},
	{"ative", ""},
	{"alize", "al"},
	{"iciti", "ic"},
	{"ical", "ic"},
	{"ful", ""},
	{"ness", ""}};

	std::list<std::pair<std::string, std::string>> stemmer_en::m_step4
	{{"al", ""},
	{"ance", ""},
	{"ence", ""},
	{"er", ""},
	{"ic", ""},
	{"able", ""},
	{"ible", ""},
	{"ant", ""},
	{"ement", ""},
	{"ment", ""},
	{"ent", ""},
	{"ou", ""},
	{"ism", ""},
	{"ate", ""},
	{"iti", ""},
	{"ous", ""},
	{"ive", ""},
	{"ize", ""}};

	stemmer_en::stemmer_en(std::string word) : m_original(word), m_stem(word) {
	boost::algorithm::to_lower(m_stem);
	if (word.length() > 2) {
	stemming();
	}
	}

	stemmer_en::~stemmer_en() {
	// ...
	}

	std::string stemmer_en::get_original() const {
	return m_original;
	}

	std::string stemmer_en::get_stem() const {
	return m_stem;
	}

	bool stemmer_en::same() const {
	return m_original == m_stem;
	}

	char stemmer_en::operator[](unsigned int idx) const {
	return m_stem[idx];
	}

	bool stemmer_en::main_vowel(char c) {
	return c == 'a' \|\| c == 'e' \|\| c == 'i' \|\| c == 'o' \|\| c == 'u';
	}

	bool stemmer_en::vowel(const std::string &s, int idx) {
	return main_vowel(s[idx]) \|\| (idx > 0 && s[idx] == 'y' && !vowel(s, idx-1));
	}

	bool stemmer_en::consonant(const std::string &s, int idx) {
	return !vowel(s, idx);
	}

	bool stemmer_en::has_vowel(const std::string &s) {
	for (int i = 0; i < s.length(); ++i) {
	if (vowel(s, i)) {
	return true;
	}
	}
	return false;
	}

	bool stemmer_en::double_consonants(const std::string &s) {
	const unsigned int x = s.length() - 1;
	return consonant(s, x) && (s[x] == s[x-1]);
	}

	unsigned int stemmer_en::measure(std::string s) {
	boost::algorithm::to_lower(s);
	unsigned int m = 0;
	bool prev = vowel(s, 0);
	for (int i = 1; i < s.length(); ++i) {
	const bool curr = vowel(s, i);
	if (prev && !curr) {
	++m;
	}
	prev = curr;
	}
	return m;
	}

	bool stemmer_en::ends_with(const std::string &w, const std::string &end) {
	const size_t n0 = w.length();
	const size_t n1 = end.length();
	if (n0 < n1) {
	return false;
	}
	for (int i = 0; i < n1; ++i) {
	if (w[i + n0 - n1] != end[i]) {
	return false;
	}
	}
	return true;
	}

	bool stemmer_en::replace_list(const std::list<std::pair<std::string, std::string>> &mappings, int m) {
	const size_t n0 = m_stem.length();
	for (const auto &w : mappings) {
	const size_t n1 = w.first.length();
	if (ends_with(m_stem, w.first) && measure(m_stem.substr(0, n0 - n1)) > m) {
	const size_t nr = w.second.length();
	const int delta = n1 - nr;
	if (delta >= 0) { // n1 >= nr
	m_stem.replace(n0 - n1, nr, w.second);
	m_stem.erase(n0 - delta, delta);
	} else { // n1 < nr
	m_stem = m_stem.substr(0, n0 - n1) + w.second;
	}
	return true;
	}
	}
	return false;
	}

	bool stemmer_en::o_rule(const std::string &stem) const {
	const size_t n = stem.length();
	if (n >= 3 && vowel(stem, n-2) && consonant(stem, n-3)) {
	const char c = stem[n-1];
	return consonant(stem, n-1) && c != 'w' && c != 'x' && c != 'y';
	}
	return false;
	}

	void stemmer_en::step1a() {
	const size_t n = m_stem.length();
	if (m_stem[n-1] == 's') {
	if (m_stem[n-2] != 's') {
	m_stem.erase(n - 1, 1);
	} else if (m_stem[n-2] == 'e') {
	if (m_stem[n-3] == 'i' \|\| (m_stem[n-3] == 's' && m_stem[n-4] == 's')) {
	m_stem.erase(n - 2, 2);
	} else {
	m_stem.erase(n - 1, 1);
	}
	}
	}
	}

	void stemmer_en::step1b() {
	const size_t n = m_stem.length();
	if (n > 3 && m_stem[n-1] == 'd' && m_stem[n-2] == 'e' && m_stem[n-3] == 'e') {
	if (measure(m_stem.substr(0, n-3)) > 0) {
	m_stem.erase(n - 1, 1);
	}
	} else if (m_stem[n-1] == 'd' && m_stem[n-2] == 'e') {
	if (has_vowel(m_stem.substr(0, n-2))) {
	m_stem.erase(n - 2, 2);
	step1b_pt2();
	}
	} else if (m_stem[n-1] == 'g' && m_stem[n-2] == 'n' && m_stem[n-3] == 'i') {
	if (has_vowel(m_stem.substr(0, n-3))) {
	m_stem.erase(n - 3, 3);
	step1b_pt2();
	}
	}
	}

	void stemmer_en::step1b_pt2() {
	const bool replaced = replace_list(m_step1b, -1);
	if (!replaced) {
	const size_t n = m_stem.length();
	if (double_consonants(m_stem) && m_stem[n-1] != 'l' && m_stem[n-1] != 's' && m_stem[n-1] != 'z') {
	m_stem.erase(n - 1, 1);
	} else if (measure(m_stem) == 1 && o_rule(m_stem)) {
	m_stem += 'e';
	}
	}
	}

	void stemmer_en::step1c() {
	const size_t n = m_stem.length();
	if (m_stem[n-1] == 'y' && has_vowel(m_stem.substr(0, n - 2))) {
	m_stem[n-1] = 'i';
	}
	}

	void stemmer_en::step1() {
	step1a();
	step1b();
	step1c();
	}

	void stemmer_en::step2() {
	replace_list(m_step2, 0);
	}

	void stemmer_en::step3() {
	replace_list(m_step3, 0);
	}

	void stemmer_en::step4() {
	const bool replaced = replace_list(m_step4, 1);
	if (!replaced && ends_with(m_stem, "ion")) {
	std::string s = m_stem.substr(0, m_stem.length() - 3);
	const char c = s[s.length() -1];
	if (measure(s) > 1 && (c == 's' \|\| c == 't')) {
	m_stem.erase(m_stem.length() - 3, 3);
	}
	}
	}

	void stemmer_en::step5a() {
	const size_t n = m_stem.length();
	if (m_stem[n-1] == 'e') {
	const std::string stem = m_stem.substr(0, n-1);
	const unsigned int m = measure(stem);
	if (m > 1 \|\| ((m == 1) && !o_rule(stem))) {
	m_stem.erase(n - 1, 1);
	}
	}
	}

	void stemmer_en::step5b() {
	const size_t n = m_stem.length();
	if (measure(m_stem) > 1 && m_stem[n-1] == 'l' && m_stem[n-2] == 'l') {
	m_stem.erase(n - 1, 1);
	}
	}

	void stemmer_en::step5() {
	step5a();
	step5b();
	}

	void stemmer_en::stemming() {
	step1();
	step2();
	step3();
	step4();
	step5();
	}