Skip to content

Instantly share code, notes, and snippets.

@cigrainger
Created June 6, 2014 13:19
Show Gist options
  • Select an option

  • Save cigrainger/11fc615aed9965ba29c0 to your computer and use it in GitHub Desktop.

Select an option

Save cigrainger/11fc615aed9965ba29c0 to your computer and use it in GitHub Desktop.
import re, string, sys, nltk, timeit
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from joblib import Parallel, delayed
# One shared WordNet lemmatizer instance, used by clean().
lmtzr = WordNetLemmatizer()

# Matches every character that is neither an ASCII letter nor a space.
pattern = re.compile(r'[^a-zA-Z ]')

# First letter of a Treebank-style POS tag -> WordNet part-of-speech constant.
# Initials that are not listed here fall back to a default in
# get_wordnet_pos().
tag_to_type = {
    'J': wordnet.ADJ,
    'V': wordnet.VERB,
    'R': wordnet.ADV,
}
def get_wordnet_pos(treebank_tag):
    """Return the WordNet POS constant matching a Treebank-style tag.

    Only the first character of ``treebank_tag`` is looked up in
    ``tag_to_type``. When that character is not a key of the table (which
    includes the empty tag), ``wordnet.NOUN`` is returned.
    """
    initial = treebank_tag[:1]
    if initial in tag_to_type:
        return tag_to_type[initial]
    return wordnet.NOUN
def clean(text):
    """Normalise a piece of text into a string of space-separated lemmas.

    Steps, in order:

    1. lower-case the text;
    2. delete the literal marker ``<image>`` and every carriage return and
       line feed;
    3. delete every remaining character that is not an ASCII letter or a
       space (see ``pattern``);
    4. tokenise with ``nltk.word_tokenize`` and tag with ``nltk.pos_tag``;
    5. lemmatise each token with ``lmtzr``, using the WordNet part of speech
       that ``get_wordnet_pos`` derives from the token's tag.

    Note that steps 2 and 3 *delete* characters rather than replacing them
    with a space, so two words separated only by punctuation or a line break
    end up joined into one token.

    Args:
        text: The raw text to clean.

    Returns:
        The lemmatised tokens joined by single spaces.
    """
    text = text.lower()
    for unwanted in ('<image>', '\r', '\n'):
        text = text.replace(unwanted, '')
    text = pattern.sub('', text)

    # pos_tag already yields (token, tag) pairs, so iterate them directly
    # instead of zipping the token list with the tagged list again.
    tagged = nltk.pos_tag(nltk.word_tokenize(text))
    return ' '.join(
        lmtzr.lemmatize(token, get_wordnet_pos(pos))
        for token, pos in tagged
    )
def final_clean(x):
    """Clean the part of one input line that follows its first comma.

    Args:
        x: One raw line of the input file, either text or bytes (the driver
            below opens the file in binary mode).

    Returns:
        The result of ``clean`` applied to everything after the first comma,
        or ``None`` when the line contains ``applnabstract`` (treated as the
        header; a message is printed) or contains no comma at all.
    """
    # On Python 3 a line read from a file opened with "rb" is bytes, and
    # bytes.split(',') raises TypeError. Decode it first. Non-ASCII bytes are
    # dropped here; clean() deletes every non-ASCII character anyway. On
    # Python 2 ``bytes is str``, so this branch is never taken.
    if isinstance(x, bytes) and not isinstance(x, str):
        x = x.decode('ascii', 'ignore')

    if 'applnabstract' in str(x):
        print('Skipping header.')
        return None

    fields = x.split(',', 1)
    if len(fields) != 2:
        # No comma on the line: nothing to clean.
        return None
    return clean(fields[1])
# Driver: clean the first lines of abstracts.txt in parallel and write one
# cleaned abstract per line to abstractsfinal.txt.
#
# The work is kept behind a __main__ guard because joblib runs final_clean in
# worker processes; on platforms that start workers by re-importing the main
# module, unguarded top-level code would be executed again in every worker.
if __name__ == '__main__':
    tic = timeit.default_timer()

    with open("abstracts.txt", "rb") as f:
        # The argument to readlines() is a size hint, not a line count:
        # reading stops once the lines read so far total roughly 1000 bytes.
        head = f.readlines(1000)

    abstracts = Parallel(n_jobs=32)(delayed(final_clean)(line) for line in head)

    toc = timeit.default_timer()
    elapsed = toc - tic
    print('Writing to file now. %s abstracts processed in %s seconds.' % (len(abstracts), elapsed))

    # Mode "w" already truncates the file and the with-block closes it, so no
    # explicit truncate()/close() calls are needed.
    with open("abstractsfinal.txt", "w") as f:
        for item in abstracts:
            # NOTE(review): final_clean returns None for the header and for
            # lines without a comma; those entries are written as the text
            # "None", exactly as before. Filter them here if that is unwanted.
            # f.write is used instead of the Python-2-only ``print >>f``.
            f.write('%s\n' % (item,))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment