Skip to content

Instantly share code, notes, and snippets.

@cigrainger
Created June 15, 2014 12:41
Show Gist options
  • Select an option

  • Save cigrainger/4503a95d22eeebe609cd to your computer and use it in GitHub Desktop.

Select an option

Save cigrainger/4503a95d22eeebe609cd to your computer and use it in GitHub Desktop.
import nltk, timeit
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from joblib import Parallel, delayed
# Single shared lemmatizer instance, reused by every clean() call.
lmtzr = WordNetLemmatizer()
# Maps the first letter of a Penn Treebank POS tag to the matching WordNet
# POS constant; first letters not listed here fall back to NOUN (see
# get_wordnet_pos below).
tag_to_type = {'J': wordnet.ADJ, 'V': wordnet.VERB, 'R': wordnet.ADV}
def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into a WordNet POS constant.

    Only the tag's first letter matters; anything without an entry in
    tag_to_type is treated as a noun.
    """
    first_letter = treebank_tag[:1]
    return tag_to_type.get(first_letter, wordnet.NOUN)
def clean(text):
    """Tokenize, POS-tag, and lemmatize *text*.

    Returns the lemmas joined back into a single space-separated string.
    nltk.pos_tag already yields (token, tag) pairs in token order, so we
    iterate those pairs directly instead of re-zipping them with the token
    list (the original zip produced (word, (word, tag)) and then indexed
    tag[1] to recover the tag string).
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(text))
    return ' '.join(
        lmtzr.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in tagged
    )
def chunker(seq, size):
    """Yield successive slices of *seq* of length *size*.

    The final slice may be shorter when len(seq) is not a multiple of
    *size*. Uses range (not xrange) so the code also runs on Python 3,
    where xrange no longer exists; on Python 2 the behavior is identical.
    """
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))
# Read all abstracts (one per line). A context manager closes the input
# file promptly — the original open() was never closed. splitlines()
# avoids the trailing empty entry that read().split('\n') produces when
# the file ends with a newline (which inflated len_abs and wrote a
# spurious blank line to the output).
with open('abstracts.txt') as infile:
    abstracts = infile.read().splitlines()
len_abs = len(abstracts)
chunksize = 50000
with open("abstractsfinal.txt", "w") as f:
    num_abs = 0
    for group in chunker(abstracts, chunksize):
        # Lemmatize the chunk in parallel across 32 worker processes.
        abstractsnew = Parallel(n_jobs=32)(delayed(clean)(line) for line in group)
        for cleaned in abstractsnew:
            f.write('%s\n' % cleaned)
        # Advance by the actual chunk size so a short final chunk does
        # not overcount (the original always added chunksize).
        num_abs += len(group)
        print('%s out of %s abstracts processed.' % (num_abs, len_abs))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment