Last active
April 8, 2019 08:52
-
-
Save wassname/7fd4c975883074a99864 to your computer and use it in GitHub Desktop.
A wrapper around the nltk snowball stemmer with a reverse lookup table
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict

import nltk
class SnowCastleStemmer(nltk.stem.SnowballStemmer):
    """A wrapper around the snowball stemmer with a reverse lookup table.

    Every word passed to ``stem`` (or ``memstem``) is recorded under the stem
    it produced, so ``unstem`` can later list the original words that were
    seen for a given stem.
    """

    def __init__(self, *args, **kwargs):
        """Build the stemmer and install the remembering ``stem`` wrapper.

        All positional and keyword arguments are forwarded unchanged to the
        parent class constructor.
        """
        # Name the class explicitly: super(self.__class__, self) resolves to
        # the subclass when this class is inherited from, which makes
        # __init__ call itself until the recursion limit is hit.
        super(SnowCastleStemmer, self).__init__(*args, **kwargs)
        # Maps a stem to the set of distinct words that produced it.
        self._stem_memory = defaultdict(set)
        # Switch stem and memstem on the instance: the inherited stem stays
        # reachable as _stem, and stem() becomes the remembering wrapper.
        # An instance attribute is used so the swap wins no matter whether
        # the parent exposes stem as a method or as an instance attribute.
        self._stem = self.stem
        self.stem = self.memstem

    def memstem(self, word):
        """Stem ``word`` and remember which word produced the stem.

        Args:
            word: The word to stem.

        Returns:
            The stem produced by the wrapped stemmer.
        """
        stemmed_word = self._stem(word)
        self._stem_memory[stemmed_word].add(word)
        return stemmed_word

    def unstem(self, stemmed_word):
        """Reverse lookup: list the words seen so far for ``stemmed_word``.

        Args:
            stemmed_word: A stem as returned by ``stem``/``memstem``.

        Returns:
            A list of the distinct remembered words, shortest first; words of
            equal length are ordered alphabetically so the result is
            deterministic. An empty list if the stem has not been seen.
        """
        # Use .get() rather than indexing: indexing a defaultdict would
        # insert an empty entry for every stem that is merely queried.
        seen_words = self._stem_memory.get(stemmed_word, ())
        return sorted(seen_words, key=lambda w: (len(w), w))
if __name__ == '__main__':
    # Smoke test: feed three inflections of one verb through the stemmer and
    # check that the reverse lookup returns all of them, shortest first.
    stemmer = SnowCastleStemmer('english')
    for word in ("building", "build", "builds"):
        stemmer.stem(word)
    expected = ['build', 'builds', 'building']
    assert expected == stemmer.unstem("build")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for the gist - could you merge the two small changes in the import statements that I put in from the fork?