Created
June 13, 2010 09:54
-
-
Save filipsalo/436518 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf8 -*- | |
"""Stemmers for Swedish and English | |
Implements the Swedish stemming algorithm used in snowball: | |
<http://snowball.tartarus.org/algorithms/swedish/stemmer.html> | |
Implements the English (Porter2) stemming algorithm used in snowball: | |
<http://snowball.tartarus.org/algorithms/english/stemmer.html> | |
""" | |
__author__ = "Filip Salomonsson <[email protected]>" | |
import codecs | |
import re | |
import os | |
DEBUG = int(os.environ.get("DEBUG", 0)) | |
def _debug(s, prefix=""): | |
if DEBUG: print >> sys.stderr, prefix, s | |
def _show(word, r1=None, r2=None): | |
if r1 is None: r1 = len(word) | |
if r2 is None: r2 = len(word) | |
return "%s[%s[%s]]" % (word[:r1], word[r1:r2], word[r2:]) | |
class SwedishStemmer(object): | |
# Compile regular expression objects for s-ending and suffixes | |
_s_ending = "bcdfghjklmnoprtvy" | |
_s_ending = re.compile(ur"(%s)s$" % u"|".join(_s_ending)) | |
_suffixes = ['heterna', 'hetens', 'anden', 'heten', 'heter', 'arnas', | |
'ernas', 'ornas', 'andes', 'arens', 'andet', 'arna', | |
'erna', 'orna', 'ande', 'arne', 'aste', 'aren', 'ades', | |
'erns', 'ade', 'are', 'ern', 'ens', 'het', 'ast', 'ad', | |
'en', 'ar', 'er', 'or', 'as', 'es', 'at', 'a', 'e'] | |
_suffixes = re.compile(ur"(%s)$" % u"|".join(_suffixes)) | |
def _base_and_region(self, word): | |
"""Split a (unicode) string into base and region. region corresponds | |
to R1 in the snowball algorithm description, and base is the part | |
preceding it.""" | |
match = re.search(ur'^(.*?[aeiouyåäö][^aeiouyåäö])(.*)$', word) | |
if match: | |
# R1 is the region after the first non-vowel following a vowel[...] | |
base, region = match.groups() | |
#(R1 is adjusted so that the region before it contains at least | |
# 3 letters.)" | |
if len(base) < 3 and len(region) > 0: | |
cut_pos = 3 - len(base) | |
base = base + region[:cut_pos] | |
region = region[cut_pos:] | |
return base, region | |
else: | |
# [...] or is the null region at the end of the word if there | |
# is no such non-vowel. | |
return word, '' | |
def stem(self, word): | |
"""Perform stemming a word (as a unicode string) and | |
return the stem.""" | |
base, region = self._base_and_region(word) | |
# Step 1: | |
# If suffix found in region, delete it... | |
region, n = re.subn(self._suffixes, "", region) | |
# ..else delete "s" if s-ending in word(!) | |
# NB: this searches in entire word, not just in region! | |
if n == 0 and re.search(self._s_ending, word) is not None: | |
region = region[:-1] # ta bort s:et | |
# Step 2: | |
# If suffix matches in region, delete the last letter. | |
def strip_last_char(match): | |
return match.group(0)[:-1] | |
region = re.sub(ur"([dg]d|nn|[dgkt]t)$", strip_last_char, region) | |
# Step 3: | |
# If -fullt och -löst in region, delete trailing t... | |
region, n = re.subn(ur"(full|lös)t$", r"\1", region) | |
# ...else, if region ends with -lig,-ig or -els, delete that suffix. | |
if n == 0: | |
region, n = re.subn(ur"(l?ig|els)$", "", region) | |
# We're done! Join base and region, and return the result as the stem. | |
return base + region | |
class EnglishStemmer(object): | |
_step_1a_invariants = set(["inning", "outing", "canning", "herring", | |
"earring", "proceed", "exceed", "succeed"]) | |
_step_2_pairs = [('enci', 'ence'), ('anci', 'ance'), | |
('abli', 'able'), | |
('entli', 'ent'), | |
('izer', 'ize'), ('ization', 'ize'), | |
('ational', 'ate'), ('ation', 'ate'), ('ator', 'ate'), | |
('tional', 'tion'), | |
('alism', 'al'), ('aliti', 'al'), ('alli', 'al'), | |
('fulness', 'ful'), | |
('ousli', 'ous'), ('ousness', 'ous'), | |
('iveness', 'ive'), ('iviti', 'ive'), | |
('biliti', 'ble'), ('bli', 'ble'), | |
('fulli', 'ful'), | |
('lessli', 'less'),] | |
_step_3_pairs = [("ational", "ate"), ("tional", "tion"), ("alize", "al"), | |
("icate", "ic"), ("iciti", "ic"), ("ical", "ic"), | |
("ful", ""), ("ness", "")] | |
_step_4_suffixes = "al ance ence er ic able ible ant ement ment " \ | |
"ent ism ate iti ous ive ize".split() | |
_step_4_suffixes = re.compile(r"(?:%s)$" % "|".join(_step_4_suffixes)) | |
_li_ending = "cdeghkmnrt" | |
_doubles = ("bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt") | |
_patterns = dict(vowel=r"[aeiouy]", nonvowel=r"[^aeiouy]", | |
nonvowelwxY=r"[^aeiouywxY]",) | |
_vc = re.compile(r"%(vowel)s %(nonvowel)s" % _patterns, re.VERBOSE) | |
_short_syllable = re.compile(r"""( | |
%(nonvowel)s %(vowel)s %(nonvowelwxY)s # (a) | |
| | |
^%(vowel)s %(nonvowel)s # (b) | |
) | |
$""" % _patterns, re.VERBOSE) | |
_short_word = re.compile(r"[^aeiouy]*" + _short_syllable.pattern, re.VERBOSE) | |
_exceptions = {"skis": "ski", | |
"skies": "sky", | |
"dying": "die", "lying": "lie", "tying": "tie", | |
"idly": "idl", "gently": "gentl", "singly": "singl", | |
"ugly": "ugli","early": "earli", "only": "onli", | |
"sky": None, "news": None, "howe": None, "atlas": None, | |
"cosmos": None, "bias": None, "andes": None, | |
} | |
def stem(self, word): | |
"""Perform stemming a word (as a unicode string) and | |
return the stem. Expects a lowercase word.""" | |
# If the word has two letters or less, leave it as it is. | |
if len(word) <= 2: return word | |
if word in self._exceptions: return self._exceptions[word] or word | |
# Remove initial apostrophe, if present. | |
if word.startswith("'"): word = word[1:] | |
# Set initial 'y', or 'y' after a vowel, to 'Y' | |
word = re.sub(ur"^y|(?<=[aeiouy])y", u"Y", word) | |
# Establish the regions R1 and R2 (as indexes) | |
r1, r2 = self._regions(word) | |
_debug(_show(word, r1, r2), "*** Stemming:") | |
# Step 0: Remove longest matching suffix | |
word = re.sub(ur"'(s'?)?$", "", word) | |
_debug(_show(word, r1, r2), "After step 0:") | |
# Step 1a: Perform action for the longest matching suffix | |
if word.endswith("sses"): | |
# replace by 'ss' | |
word = word[:-2] | |
elif word.endswith("ied") or word.endswith("ies"): | |
# replace by 'i' if preceded by more than one letter... | |
if len(word) > 4: | |
word = word[:-2] | |
# ...otherwise by 'ie' | |
else: | |
word = word[:-1] | |
elif word.endswith("us") or word.endswith("ss"): | |
# do nothing | |
pass | |
elif word.endswith("s") and re.search(ur"[aeiouy]", word[:-2]): | |
# delete if the preceding part contains a vowel | |
# not immediately before the 's' | |
word = word[:-1] | |
_debug(_show(word, r1, r2), "After step 1a:") | |
# Following step 1a, leave the following invariant | |
if word in self._step_1a_invariants: | |
return word | |
# Step 1b: Perform action for the longest matching suffix | |
match = re.search(r"eed(ly)?$", word) | |
if match: | |
start = match.start() | |
# replace by 'ee' if in R1 | |
if start >= r1: word = word[:start] + "ee" | |
else: | |
match = re.search(r"(ed|ing)(ly)?$", word) | |
if match: | |
# delete if preceding part contains a vowel... | |
if re.search(r"[aeiouy]", word[:match.start()]): | |
word = word[:match.start()] | |
# ...and then: | |
if word.endswith("at") or word.endswith("bl") \ | |
or word.endswith("iz") or word.endswith("unxxx"): | |
word += u"e" | |
elif word[-2:] in self._doubles: | |
# split doubles | |
word = word[:-1] | |
elif self._short_word.match(word): | |
word += u"e" | |
_debug(_show(word, r1, r2), "After step 1b:") | |
# Step 1c: | |
word = re.sub(r"(?<=.[^aeiouy])[yY]$", "i", word) | |
_debug(_show(word, r1, r2), "After step 1c:") | |
# Step 2: For the longest matching suffix... | |
suff = word[r1:] | |
for find, repl in self._step_2_pairs: | |
if word.endswith(find): | |
# ...perform action if suffix is in R1 | |
if suff.endswith(find): | |
word = word[:-len(find)] + repl | |
break | |
else: | |
if word[r1-1:].endswith("logi"): word = word[:-1] | |
elif suff.endswith("li") and word[-3] in self._li_ending: | |
word = word[:-2] | |
_debug(_show(word, r1, r2), "2") | |
# Step 3: Again, for the longest matching suffix... | |
suff = word[r1:] | |
for find, repl in self._step_3_pairs: | |
if suff.endswith(find): | |
word = word[:-len(find)] + repl | |
break | |
else: | |
if word[r2:].endswith("ative"): | |
word = word[:-5] | |
_debug(_show(word, r1, r2), "After step 3:") | |
# Step 4 | |
match = self._step_4_suffixes.search(word) | |
if match: | |
start = match.start() | |
if start >= r2: word = word[:start] | |
else: | |
if word[r2:].endswith("ion") and word[-4] in "st": | |
word = word[:-3] | |
_debug(_show(word, r1, r2), "After step 4:") | |
# Step 5: | |
if word[r2:].endswith("e") or \ | |
word[r1:].endswith("e") and not self._short_syllable.search(word[:-1]): | |
word = word[:-1] | |
elif word[r2-1:].endswith("ll"): word = word[:-1] | |
word = word.replace("Y", "y") | |
_debug(_show(word, r1, r2), "After step 5:") | |
return word | |
def _regions(self, word): | |
"""Calculate and return start positions for R1 and R2.""" | |
end = len(word) | |
# R1 starts after the first VC... | |
match = self._vc.search(word) | |
if match: r1 = match.end() | |
else: r1 = end | |
# ...with these exceptions, where R1 starts after the given prefix. | |
for exception in ("gener", "commun"): | |
if word.startswith(exception): | |
r1 = len(exception) | |
# R2 starts after the first VC in R1 | |
match = self._vc.search(word[r1:]) | |
if match: r2 = r1 + match.end() | |
else: r2 = len(word) | |
return r1, r2 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment