Skip to content

Instantly share code, notes, and snippets.

@cigrainger
Created April 28, 2014 14:41
Show Gist options
  • Select an option

  • Save cigrainger/11374165 to your computer and use it in GitHub Desktop.

Select an option

Save cigrainger/11374165 to your computer and use it in GitHub Desktop.
import sys
from nltk.stem.wordnet import WordNetLemmatizer
# Single WordNet lemmatizer instance, created once and reused by clean().
lmtzr = WordNetLemmatizer()
import re, string
# Matches every character that is neither an ASCII letter nor a space;
# clean() deletes all matches (digits, punctuation, line breaks, ...).
pattern=re.compile(r'[^a-zA-Z ]')
def clean(x):
    """Normalise one abstract into a single line of lemmatized lowercase words.

    Steps, in order:

    1. Remove the literal ``<image>`` placeholder (case-sensitive, done
       before lowercasing).
    2. Lowercase the text and delete every character that is not an ASCII
       letter or a space, using the module-level ``pattern``.
    3. Split on the literal space character and pass each token through
       the module-level WordNet lemmatizer ``lmtzr``.

    Args:
        x: Raw abstract text.

    Returns:
        The lemmatized tokens joined back together with single spaces.
    """
    x = x.replace('<image>', '')
    # The pattern already deletes '\r' and '\n' along with all other
    # non-letter characters, so no separate newline stripping is needed.
    # Line breaks are deleted, not replaced by a space.
    x = pattern.sub('', x.lower())
    # Split on ' ' (not on arbitrary whitespace): consecutive spaces yield
    # empty tokens, which are handed to the lemmatizer like any other token.
    return ' '.join([lmtzr.lemmatize(token) for token in x.split(' ')])
# Split the patent CSV into two parallel, line-aligned text files: the cleaned
# abstract of every usable row goes to abstracts.txt and the text before the
# row's first comma (written to patentids.txt) goes to the same line number
# of the ids file.
#
# Paths are raw strings: in an ordinary literal the "\U" of "C:\Users" starts
# a unicode escape and is a syntax error on Python 3.
#
# The input is decoded as latin-1 because that codec maps every byte to
# exactly one character, so the file can be read as text without knowing its
# real encoding and the ids are written back byte-for-byte; clean() discards
# every non-ASCII-letter character anyway. newline="\n" makes records end
# only at "\n", matching line iteration over a binary file.
with open(r"C:\Users\graingec\spillovers\data\patents_abstracts.csv",
          encoding="latin-1", newline="\n") as f, \
        open(r"C:\Users\graingec\spillovers\abstracts\abstracts.txt",
             "w", encoding="latin-1") as f2, \
        open(r"C:\Users\graingec\spillovers\abstracts\patentids.txt",
             "w", encoding="latin-1") as f3:
    for line in f:
        # Only the first comma separates the two fields; later commas
        # belong to the abstract text itself.
        y = line.split(',', 1)
        # Lines without any comma have no abstract field and are skipped.
        if len(y) == 2:
            c = y[1].replace(',', '')
            a = clean(c)
            f2.write(a + '\n')
            f3.write(y[0] + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment