Skip to content

Instantly share code, notes, and snippets.

@cigrainger
Created June 10, 2014 17:35
Show Gist options
  • Select an option

  • Save cigrainger/d76a067a2175e715f245 to your computer and use it in GitHub Desktop.

Select an option

Save cigrainger/d76a067a2175e715f245 to your computer and use it in GitHub Desktop.
import nltk, timeit
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from joblib import Parallel, delayed
from functools32 import lru_cache
# First letter of a part-of-speech tag -> WordNet POS constant. Only
# adjectives, verbs and adverbs are listed; callers choose their own
# fallback for any other tag.
tag_to_type = {
    'J': wordnet.ADJ,
    'V': wordnet.VERB,
    'R': wordnet.ADV,
}

lmtzr = WordNetLemmatizer()

# Memoised front-end to lmtzr.lemmatize: results for up to 100000 distinct
# argument combinations are kept so repeated words are not re-lemmatized.
lemmatize = lru_cache(maxsize=100000)(lmtzr.lemmatize)
def get_wordnet_pos(treebank_tag):
    """Translate a part-of-speech tag into a WordNet POS constant.

    Only the first character of ``treebank_tag`` is looked up in
    ``tag_to_type``; any tag whose first character is not listed there
    (including an empty tag) maps to ``wordnet.NOUN``.
    """
    initial = treebank_tag[:1]
    if initial in tag_to_type:
        return tag_to_type[initial]
    return wordnet.NOUN
def clean(text):
    """Return the lemma of a single word, using its part-of-speech tag.

    ``text`` is one token (the script passes the pieces of
    ``line.split()``). The token is tagged on its own, the tag is mapped
    to a WordNet POS via ``get_wordnet_pos``, and the cached ``lemmatize``
    is called with that POS.
    """
    # pos_tag expects a *list* of tokens and returns (token, tag) pairs.
    # Passing the bare string would tag each character separately, so wrap
    # the word in a one-element list and take the tag of that sole entry.
    treebank_tag = nltk.pos_tag([text])[0][1]
    return lemmatize(text, get_wordnet_pos(treebank_tag))
# Lemmatize every word of the input file and write the result, one output
# line per input line, timing the whole run.
tic = timeit.default_timer()
with open('abstracts.txt', 'r') as f, open('test.txt', 'w') as f2:
    # Mode 'w' already truncates test.txt, so no explicit truncate() is
    # needed. The readlines() size hint caps how much input is loaded:
    # reading stops once roughly that much data has been consumed.
    head = f.readlines(10000000)
    # Reuse one worker pool for all lines instead of starting and tearing
    # down a fresh pool per line.
    with Parallel(n_jobs=32) as parallel:
        for line in head:
            lemmas = parallel(delayed(clean)(word) for word in line.split())
            f2.write('%s\n' % ' '.join(lemmas))
toc = timeit.default_timer()
elapsed = toc - tic
# Report the number of lines actually read (presumably one abstract per
# line -- confirm against the input format).
print('%s abstracts processed in %s seconds.' % (len(head), elapsed))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment