Skip to content

Instantly share code, notes, and snippets.

@cigrainger
Created June 9, 2014 10:13
Show Gist options
  • Select an option

  • Save cigrainger/e20ec343be89c944616b to your computer and use it in GitHub Desktop.

Select an option

Save cigrainger/e20ec343be89c944616b to your computer and use it in GitHub Desktop.
import re, string, sys, nltk, timeit
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from joblib import Parallel, delayed
# Single lemmatizer instance shared by every call to clean().
lmtzr = WordNetLemmatizer()

# Matches any character that is neither an ASCII letter nor a space.
pattern = re.compile(r'[^a-zA-Z ]')

# Matches a one- or two-character word together with any non-word
# characters immediately before it.
shortword = re.compile(r'\W*\b\w{1,2}\b')

# First letter of a Treebank POS tag -> WordNet part-of-speech constant.
tag_to_type = {
    'J': wordnet.ADJ,
    'V': wordnet.VERB,
    'R': wordnet.ADV,
}
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into a WordNet part-of-speech constant.

    Only the first character of ``treebank_tag`` is inspected. Tags whose
    first character is not a key of ``tag_to_type`` (including the empty
    string) map to ``wordnet.NOUN``.
    """
    initial = treebank_tag[:1]
    if initial in tag_to_type:
        return tag_to_type[initial]
    return wordnet.NOUN
def clean(text):
    """Normalise ``text`` into a space-separated string of lemmas.

    Steps, in order: lowercase; drop the ``<image>`` marker and any CR/LF
    characters; strip everything matched by ``pattern``; remove the short
    words matched by ``shortword``; tokenize; POS-tag; lemmatize each token
    using the WordNet part of speech derived from its tag.
    """
    lowered = text.lower()
    for junk in ('<image>', '\r', '\n'):
        lowered = lowered.replace(junk, '')
    letters_only = pattern.sub('', lowered)

    tokens = nltk.word_tokenize(shortword.sub('', letters_only))

    lemmas = []
    # pos_tag yields (token, treebank_tag) pairs in token order.
    for token, treebank_tag in nltk.pos_tag(tokens):
        lemmas.append(lmtzr.lemmatize(token, get_wordnet_pos(treebank_tag)))
    return ' '.join(lemmas)
def final_clean(text):
    """Clean the abstract portion of one ``<prefix>,<abstract>`` line.

    The line is split on its first comma. Everything before that comma is
    discarded; the remainder has its other commas removed and is passed to
    ``clean``.

    Args:
        text: One input line, as ``str`` or as ``bytes``. Bytes are decoded
            as UTF-8 with undecodable bytes replaced.

    Returns:
        The cleaned abstract, or ``None`` (after printing a notice) when the
        line contains no comma.
    """
    if isinstance(text, bytes):
        # Lines read from a file opened in binary mode arrive as bytes, and
        # splitting bytes on a str separator raises TypeError on Python 3.
        text = text.decode('utf-8', 'replace')

    _, comma, abstract = text.partition(',')
    if not comma:
        print('None -- skipping line.')
        return None
    return clean(abstract.replace(',', ''))
# Clean every line of the input file in parallel and time the whole pass.
tic = timeit.default_timer()
with open("newabstracts.txt","rb") as f:
    # Decode each line before dispatch so the workers always receive text;
    # on Python 3 a binary-mode file yields bytes, which cannot be split on
    # a str separator. NOTE(review): UTF-8 is assumed for the input file --
    # confirm; undecodable bytes are replaced rather than raising.
    abstracts = Parallel(n_jobs=32)(
        delayed(final_clean)(line.decode('utf-8', 'replace')) for line in f
    )
toc = timeit.default_timer()
time = toc - tic
# The reported count is the number of input lines, including any that were
# skipped because they had no comma.
print('Writing to file now. %s abstracts processed in %s seconds.' % (len(abstracts),time))
with open("abstractsfinal.txt","w") as f:
    # Mode "w" already truncates the file, so no explicit truncate() call.
    # The first result is not written (presumably a header row -- confirm).
    for item in abstracts[1:]:
        # Skipped lines come back as None; writing them would put the
        # literal text "None" into the output file.
        if item is not None:
            f.write('%s\n' % item)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment