# Source: GitHub gist cigrainger/7b03745e72241c69033a
# Author: @cigrainger, created May 23, 2014 15:05
import re
import string
import sys

import nltk
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
# Shared WordNet lemmatizer instance; clean() calls lmtzr.lemmatize() on every token.
lmtzr = WordNetLemmatizer()
# Matches any single character that is NOT an ASCII letter or a space;
# clean() substitutes every match with the empty string.
pattern=re.compile(r'[^a-zA-Z ]')
def get_wordnet_pos(treebank_tag):
    """Map a part-of-speech tag to the matching WordNet POS constant.

    The decision is made on the first character of the tag only:
    ``J`` -> adjective, ``V`` -> verb, ``R`` -> adverb. Every other tag
    (including an empty string) falls back to noun.

    Requires ``wordnet`` (``from nltk.corpus import wordnet``) to be
    imported at module level.

    Args:
        treebank_tag: Tag string for one token, e.g. the second element of
            each pair returned by ``nltk.pos_tag`` (presumably Penn
            Treebank tags such as ``JJ``, ``VBD``, ``RB``, ``NN``).

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.ADV`` or
        ``wordnet.NOUN``.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
    }
    # Slicing (rather than indexing) keeps an empty tag from raising and
    # sends it to the noun fallback, exactly as startswith() would.
    return first_letter_to_pos.get(treebank_tag[:1], wordnet.NOUN)
def clean(x):
    """Normalise one piece of text into a space-separated string of lemmas.

    Steps, in order:

    1. Remove every literal ``<image>`` marker. The match is case-sensitive
       because it runs before lower-casing.
    2. Lower-case the text and delete every character that is not an ASCII
       letter or a space (this also removes ``\\r`` and ``\\n``).
    3. Tokenise with ``nltk.word_tokenize`` and tag with ``nltk.pos_tag``.
    4. Lemmatise each token with the WordNet POS derived from its tag.

    Args:
        x: Raw text to clean.

    Returns:
        The lemmatised tokens joined by single spaces.
    """
    # The marker must be removed before the regex runs; otherwise the angle
    # brackets would be stripped and a stray "image" token would remain.
    text = pattern.sub('', x.replace('<image>', '').lower())
    tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    # Pair each original token with the tag assigned to the same position.
    lemmas = [
        lmtzr.lemmatize(token, get_wordnet_pos(tag))
        for token, (_, tag) in zip(tokens, tagged)
    ]
    return ' '.join(lemmas)
# Batch driver: reads "<patent id>,<abstract text>" records (one per line),
# writes the cleaned abstract and the patent id to two parallel output files
# so that line N of one file corresponds to line N of the other.
#
# Raw string literals keep the Windows backslashes literal; in a normal
# string literal "\U" starts a unicode escape and is a SyntaxError on
# Python 3.
ABSTRACTS_IN = r"C:\Users\graingec\spillovers\abstracts\abstracts.txt"
ABSTRACTS_OUT = r"C:\Users\graingec\spillovers\abstracts\abstractsfinal.txt"
PATENT_IDS_OUT = r"C:\Users\graingec\spillovers\abstracts\patentids.txt"

# latin-1 maps every byte to exactly one code point, so decoding can never
# fail and patent ids are written back byte-for-byte. The cleaned abstracts
# are pure ASCII, so the encoding does not affect them.
# newline="\n" on the input means only "\n" ends a record, as the original
# binary-mode iteration did; a stray "\r" stays in the abstract and is
# stripped by clean().
# Mode "w" already truncates the file on open, so no explicit truncate().
with open(ABSTRACTS_IN, encoding="latin-1", newline="\n") as src, \
        open(ABSTRACTS_OUT, "w", encoding="latin-1") as cleaned_out, \
        open(PATENT_IDS_OUT, "w", encoding="latin-1") as ids_out:
    for line in src:
        # Split on the first comma only: everything before it is the id,
        # everything after it is the abstract.
        patent_id, comma, abstract = line.partition(',')
        if not comma:
            # No comma means no id/abstract pair on this line; skip it.
            continue
        cleaned_out.write(clean(abstract.replace(',', '')) + '\n')
        ids_out.write(patent_id + '\n')
# (End of gist; GitHub page footer omitted.)