cigrainger · May 16, 2014 14:08
diff --git a/gistfile1.py b/gistfile1.py
 import re, string, sys, nltk
 from nltk.stem.wordnet import WordNetLemmatizer

 # Single WordNet lemmatizer instance, shared by every clean() call.
 lmtzr = WordNetLemmatizer()
 # Matches any one character that is neither an ASCII letter nor a space;
 # clean() substitutes '' for each match.
 pattern=re.compile(r'[^a-zA-Z ]')

 # First letter of a Penn Treebank tag -> WordNet POS code.
 # The codes are the values of NLTK's wordnet.ADJ / VERB / ADV constants;
 # they are spelled out because this file never imports `wordnet`, so the
 # previous `wordnet.ADJ` style lookups raised NameError at call time.
 _TREEBANK_TO_WORDNET = {
     'J': 'a',  # adjective
     'V': 'v',  # verb
     'R': 'r',  # adverb
 }


 def get_wordnet_pos(treebank_tag):
     """Translate a Penn Treebank POS tag into a WordNet POS code.

     Args:
         treebank_tag: tag string such as 'JJ', 'VBD', 'RB' or 'NN'.

     Returns:
         'a' for tags starting with 'J', 'v' for 'V', 'r' for 'R', and
         'n' (noun) for everything else, including the empty string.
     """
     # Slicing instead of indexing keeps the empty tag on the noun default.
     return _TREEBANK_TO_WORDNET.get(treebank_tag[:1], 'n')

 def clean(x):
     """Reduce one abstract to a space-separated string of lemmas.

     The text has its '<image>' markers removed, is lower-cased, and has
     every character matched by the module-level `pattern` deleted, along
     with any carriage returns and newlines. It is then tokenized and
     POS-tagged with NLTK, and each token is lemmatized using the WordNet
     POS that get_wordnet_pos() derives from its tag.

     Args:
         x: raw abstract text.

     Returns:
         The lemmatized tokens joined by single spaces.
     """
     text = pattern.sub('', x.replace('<image>', '').lower())
     text = text.replace('\r', '').replace('\n', '')

     tokens = nltk.word_tokenize(text)
     tagged = nltk.pos_tag(tokens)

     # Pair each token with the tag produced for it; only the tag half of
     # the (token, tag) tuple is needed from the tagger output.
     lemmas = [
         lmtzr.lemmatize(token, get_wordnet_pos(treebank_tag))
         for token, (_, treebank_tag) in zip(tokens, tagged)
     ]
     return ' '.join(lemmas)

 # Split each "id,abstract" input line, clean the abstract, and write the
 # cleaned text and the id to two parallel files (one entry per line each).
 #
 # Paths are raw strings so the Windows backslashes stay literal instead of
 # depending on unrecognised escape sequences ("\U..." is a SyntaxError on
 # Python 3); the resulting path values are unchanged.
 # Opening with "w" already truncates the outputs, so no explicit
 # truncate() call is needed.
 # NOTE(review): "rb" yields byte strings, so splitting on ',' relies on
 # Python 2 str semantics - confirm the target interpreter before porting.
 with open(r"C:\Users\graingec\spillovers\data\patents_abstracts.csv", "rb") as f, \
         open(r"C:\Users\graingec\spillovers\abstracts\abstracts.txt", "w") as f2, \
         open(r"C:\Users\graingec\spillovers\abstracts\patentids.txt", "w") as f3:
     for line in f:
         # Only the first comma separates the id from the abstract.
         y = line.split(',', 1)
         if len(y) != 2:
             # No comma on this line: nothing to pair, skip it.
             continue
         # Remaining commas are stripped before cleaning.
         c = y[1].replace(',', '')
         a = clean(c)
         f2.write(a + '\n')
         f3.write(y[0] + '\n')
	import re, string, sys, nltk
	from nltk.stem.wordnet import WordNetLemmatizer

	# Single WordNet lemmatizer instance, shared by every clean() call.
	lmtzr = WordNetLemmatizer()
	# Matches any one character that is neither an ASCII letter nor a space;
	# clean() substitutes '' for each match.
	pattern=re.compile(r'[^a-zA-Z ]')

	# First letter of a Penn Treebank tag -> WordNet POS code.
	# The codes are the values of NLTK's wordnet.ADJ / VERB / ADV constants;
	# they are spelled out because this file never imports `wordnet`, so the
	# previous `wordnet.ADJ` style lookups raised NameError at call time.
	_TREEBANK_TO_WORDNET = {
	    'J': 'a',  # adjective
	    'V': 'v',  # verb
	    'R': 'r',  # adverb
	}


	def get_wordnet_pos(treebank_tag):
	    """Translate a Penn Treebank POS tag into a WordNet POS code.

	    Args:
	        treebank_tag: tag string such as 'JJ', 'VBD', 'RB' or 'NN'.

	    Returns:
	        'a' for tags starting with 'J', 'v' for 'V', 'r' for 'R', and
	        'n' (noun) for everything else, including the empty string.
	    """
	    # Slicing instead of indexing keeps the empty tag on the noun default.
	    return _TREEBANK_TO_WORDNET.get(treebank_tag[:1], 'n')

	def clean(x):
	    """Reduce one abstract to a space-separated string of lemmas.

	    The text has its '<image>' markers removed, is lower-cased, and has
	    every character matched by the module-level `pattern` deleted, along
	    with any carriage returns and newlines. It is then tokenized and
	    POS-tagged with NLTK, and each token is lemmatized using the WordNet
	    POS that get_wordnet_pos() derives from its tag.

	    Args:
	        x: raw abstract text.

	    Returns:
	        The lemmatized tokens joined by single spaces.
	    """
	    text = pattern.sub('', x.replace('<image>', '').lower())
	    text = text.replace('\r', '').replace('\n', '')

	    tokens = nltk.word_tokenize(text)
	    tagged = nltk.pos_tag(tokens)

	    # Pair each token with the tag produced for it; only the tag half of
	    # the (token, tag) tuple is needed from the tagger output.
	    lemmas = [
	        lmtzr.lemmatize(token, get_wordnet_pos(treebank_tag))
	        for token, (_, treebank_tag) in zip(tokens, tagged)
	    ]
	    return ' '.join(lemmas)

	# Split each "id,abstract" input line, clean the abstract, and write the
	# cleaned text and the id to two parallel files (one entry per line each).
	#
	# Paths are raw strings so the Windows backslashes stay literal instead of
	# depending on unrecognised escape sequences ("\U..." is a SyntaxError on
	# Python 3); the resulting path values are unchanged.
	# Opening with "w" already truncates the outputs, so no explicit
	# truncate() call is needed.
	# NOTE(review): "rb" yields byte strings, so splitting on ',' relies on
	# Python 2 str semantics - confirm the target interpreter before porting.
	with open(r"C:\Users\graingec\spillovers\data\patents_abstracts.csv", "rb") as f, \
	        open(r"C:\Users\graingec\spillovers\abstracts\abstracts.txt", "w") as f2, \
	        open(r"C:\Users\graingec\spillovers\abstracts\patentids.txt", "w") as f3:
	    for line in f:
	        # Only the first comma separates the id from the abstract.
	        y = line.split(',', 1)
	        if len(y) != 2:
	            # No comma on this line: nothing to pair, skip it.
	            continue
	        # Remaining commas are stripped before cleaning.
	        c = y[1].replace(',', '')
	        a = clean(c)
	        f2.write(a + '\n')
	        f3.write(y[0] + '\n')
No results found