Forked from bbengfort/token_count.py
A Dumbo MapReduce job that counts lemmas: each input line is tokenized, lowercased, lemmatized with NLTK's WordNetLemmatizer, filtered against a stopword list, and counted.
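With Dumbo installed, the job would typically be launched along these lines (the exact flags depend on your Dumbo/Hadoop setup; -excludes is the custom option that starter below rewrites into the stopwords param):

    dumbo start token_count.py -input <input file> -output <output dir> -excludes <stopwords file>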
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import wordpunct_tokenize


class Mapper(object):

    def __init__(self):
        # Dumbo injects `self.params` with any -param options passed on
        # the command line. If a stopwords file was supplied, load it;
        # otherwise fall back lazily to NLTK's English stopword corpus.
        if 'stopwords' in self.params:
            with open(self.params['stopwords'], 'r') as excludes:
                self._stopwords = set(line.strip() for line in excludes)
        else:
            self._stopwords = None

        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, key, value):
        # Emit (lemma, 1) for every non-stopword token in the input line.
        for word in self.tokenize(value):
            if word not in self.stopwords:
                yield word, 1

    def normalize(self, word):
        # Lowercase, then lemmatize (defaults to noun lemmas).
        word = word.lower()
        return self.lemmatizer.lemmatize(word)

    def tokenize(self, sentence):
        for word in wordpunct_tokenize(sentence):
            yield self.normalize(word)

    @property
    def stopwords(self):
        # Lazy fallback: load NLTK's English stopwords on first access.
        # A set keeps the membership test in __call__ O(1).
        if not self._stopwords:
            self._stopwords = set(nltk.corpus.stopwords.words('english'))
        return self._stopwords


def reducer(key, values):
    # Sum the partial counts for each lemma; also used as the combiner.
    yield key, sum(values)


def runner(job):
    job.additer(Mapper, reducer, reducer)


def starter(prog):
    # Convert a -excludes <file> option into the 'stopwords' param
    # that the mapper reads from self.params.
    excludes = prog.delopt("excludes")
    if excludes:
        prog.addopt("param", "stopwords=" + excludes)


if __name__ == "__main__":
    import dumbo
    dumbo.main(runner, starter)
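The mapper and reducer can be sanity-checked outside Hadoop by driving them by hand. The sketch below is a minimal example, assuming the definitions above are in scope, that stubs the params attribute Dumbo would normally inject; the sample sentence is made up, and the NLTK wordnet and stopwords corpora must already be downloaded.

# Minimal local sanity check (no Hadoop/Dumbo required). Dumbo normally
# injects `params` onto the mapper, so stub it with an empty dict to
# exercise the NLTK stopword fallback.
from collections import defaultdict

Mapper.params = {}
mapper = Mapper()

# Shuffle phase by hand: group the (lemma, 1) pairs by lemma.
grouped = defaultdict(list)
for lemma, one in mapper(None, "The cats were chasing the other cats"):
    grouped[lemma].append(one)

# Reduce phase: sum the counts per lemma.
for lemma, ones in grouped.items():
    for key, total in reducer(lemma, ones):
        print(key, total)   # "cats" collapses to "cat" with count 2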