Skip to content

Instantly share code, notes, and snippets.

@alexstorer
Created November 8, 2012 19:21
Show Gist options
  • Save alexstorer/4040920 to your computer and use it in GitHub Desktop.
Save alexstorer/4040920 to your computer and use it in GitHub Desktop.
Patent Processing
# Use this file by navigating in the terminal to the directory where your file is located and typing:
# python parsexml.py [name of xml file]
# You can parse multiple xml files by using the wildcard operator.
# To process every xml file in a directory, do this:
# python parsexml.py *.xml
from lxml import etree
import re
import csv
import os.path as op
import sys
import glob
class PatParse(object):
    """Split one patent XML file into a set of per-topic CSV files.

    Constructing ``PatParse(fname)`` does all the work: it creates nine CSV
    files in the current directory, named ``<prefix>_<topic>.csv`` where
    ``<prefix>`` is the base name of ``fname`` without its extension, parses
    ``fname`` with lxml, and for every ``PatentTyp1`` element found under a
    ``PanTyp1`` element calls each ``find*`` method to append rows.

    Every row starts with the key columns ``pan``, ``co``, ``ki``, ``se`` and
    ``typ`` taken from the enclosing ``PanTyp1``/``PatentTyp1`` attributes.

    The code is written to run unchanged on Python 2.7 and Python 3.
    """

    def __init__(self, fname):
        """Parse ``fname`` and write all CSV outputs.

        Args:
            fname: path of the XML file to parse.

        All files opened here are closed before the constructor returns,
        including when parsing or writing raises.
        """
        prefix = op.splitext(op.basename(fname))[0]
        basenames = ['pan', 'co', 'ki', 'se', 'typ']
        handles = []

        def open_writer(suffix, fieldnames):
            # Create "<prefix>_<suffix>.csv", remember the handle so it can be
            # closed later, and emit the header row immediately.
            fh = open(prefix + '_' + suffix + '.csv', 'w')
            handles.append(fh)
            writer = csv.DictWriter(fh, fieldnames)
            writer.writeheader()
            return writer

        try:
            # NOTE(review): "dccs" appears twice in this column list, so the
            # value is written to two columns. Kept as-is to preserve the
            # existing CSV layout - confirm whether one was meant to be
            # another field.
            self.dwdc = open_writer('dc', [
                'pan', 'co', 'ki', 'se', 'typ', 'mcc', 'dccm', 'dccs',
                "cpi-sct", "dccs", "engpi-sct", "dceng", "epi-sct",
                "dce", "mce"])
            self.dwipc = open_writer('ipc', [
                'pan', 'co', 'ki', 'se', 'typ', "IPC-rank", "IPC-text"])
            self.dwplasdoc = open_writer('plasdoc', basenames + [
                "PCode-no", "PCode-MPs", "PCode-Ks"])
            self.dwfrag = open_writer('frag', basenames + [
                "FragSjct", "CardRec-no", "CardRec-trc", "Fcodes-sjct",
                "Fcodes-trc", "Fcodes", "RINs"])
            self.dwidxu = open_writer('idxu', basenames + [
                "DRNUs", "RINUs", "DCRUs"])
            self.dwpolymer = open_writer('polymer', basenames + [
                "ESent-no", "ESent-trc", "EPara-no", "ETs", "SCNDs", "DCRs"])
            self.dwki = open_writer('ki', basenames + [
                "KSent-no", "KSent-flvr", "KPara-no", "DCR-rl", "DCR"])
            self.dwsrc = open_writer('src', basenames + [
                "Cited-pan", "Cited-ki", "Cited-se", "Cited-co",
                "Cited-src-cat", "Cited-src"])
            self.dwjrn = open_writer('jrn', basenames + [
                "jn-cat", "jn-id", "jn-typ", "Journal Title",
                "year", "year_test"])

            # Bound methods instead of eval() on strings; the call order is
            # unchanged.
            finders = (self.findDC, self.findIPC, self.findPlasdoc,
                       self.findFrag, self.findIdxU, self.findPolymer,
                       self.findKI, self.findCitedPt, self.findCitedJn)

            # Binary mode lets the XML parser handle the encoding itself.
            with open(fname, 'rb') as f:
                tree = etree.parse(f)

            for p in tree.xpath('//PanTyp1'):
                panval = p.get('pan')
                for pt in p.xpath('.//PatentTyp1'):
                    d0 = {
                        "pan": panval,
                        "co": pt.get('co'),
                        "se": pt.get('se'),
                        "ki": pt.get('ki'),
                        "typ": pt.get('typ'),
                    }
                    for finder in finders:
                        finder(pt, d0)
        finally:
            for fh in handles:
                fh.close()

    @staticmethod
    def _join_text(elements):
        """Return the ``.text`` of each element joined by commas.

        Elements without text contribute an empty string instead of raising.
        """
        return ','.join((e.text or '') for e in elements)

    @staticmethod
    def _join_rl_text(elements):
        """Return ``"<rl>-<text>"`` for each element, joined by commas.

        A missing ``rl`` attribute or missing text is treated as empty.
        """
        return ','.join((e.get('rl') or '') + '-' + (e.text or '')
                        for e in elements)

    @staticmethod
    def _extract_years(text):
        """Guess a publication year from free text.

        Returns:
            A ``(year, year_test)`` tuple of 4-digit strings (or ``None``).
            ``year`` considers 4-digit numbers that are followed by a
            non-digit; ``year_test`` additionally considers a 4-digit number
            at the very end of the text. In both cases only values strictly
            between 1900 and 2013 are kept and the most frequent one wins
            (the earliest-occurring one on a tie).
        """
        # Drop things like " 1234-1240" (4 digits, dash, 3+ digits) so they
        # are not mistaken for years.
        stripped = re.sub(r'\D\d{4}\s*-\s*\d{2}\d+', '', text)
        # The prefix guarantees a non-digit before a leading 4-digit number.
        padded = 'random' + stripped

        def pick(candidates):
            plausible = [c for c in candidates if 1900 < int(c) < 2013]
            if not plausible:
                return None
            return max(plausible, key=plausible.count)

        candidates = re.findall(r'\D(\d{4})\D', padded)
        year = pick(candidates)
        candidates.extend(re.findall(r'\D(\d{4})$', padded))
        year_test = pick(candidates)
        return year, year_test

    def findDC(self, pt, d0):
        """Write one row per CPI, EngPI and EPI element under ``pt``.

        If ``pt`` has none of those elements, a single row holding only the
        key columns from ``d0`` is written.
        """
        dw = self.dwdc
        join_text = self._join_text

        allcpi = pt.xpath('.//CPI')
        for c in allcpi:
            d = d0.copy()
            d["cpi-sct"] = c.get('sct')
            d["dccm"] = join_text(c.xpath('.//DCCM'))
            d["dccs"] = join_text(c.xpath('.//DCCS'))
            d["mcc"] = join_text(c.xpath('.//MCC'))
            dw.writerow(d)

        alleng = pt.xpath('.//EngPI')
        for c in alleng:
            d = d0.copy()
            d["engpi-sct"] = c.get('sct')
            d["dceng"] = join_text(c.xpath('.//DCEng'))
            dw.writerow(d)

        allepi = pt.xpath('.//EPI')
        for c in allepi:
            d = d0.copy()
            d["epi-sct"] = c.get('sct')
            d["dce"] = join_text(c.xpath('.//DCE'))
            d["mce"] = join_text(c.xpath('.//MCE'))
            dw.writerow(d)

        # xpath() returns lists, never None, so emptiness is the right test
        # for "nothing was found".
        if not (allcpi or alleng or allepi):
            dw.writerow(d0)

    def findIPC(self, pt, d0):
        """Write one row per IPC element under ``pt`` (rank and text)."""
        dw = self.dwipc
        for c in pt.xpath('.//IPC'):
            d = d0.copy()
            d["IPC-rank"] = c.get('rnk')
            d["IPC-text"] = c.text
            dw.writerow(d)

    def findPlasdoc(self, pt, d0):
        """Write one row per PCodes element under ``pt``."""
        dw = self.dwplasdoc
        for c in pt.xpath('.//PCodes'):
            d = d0.copy()
            d["PCode-no"] = c.get('no')
            # Spaces inside the MPs text are turned into commas as well.
            d["PCode-MPs"] = self._join_text(
                c.xpath('.//MPs')).replace(" ", ",")
            d["PCode-Ks"] = self._join_text(c.xpath('.//K'))
            dw.writerow(d)

    def findFrag(self, pt, d0):
        """Write one row per CardRec element inside each FragSub of ``pt``.

        NOTE(review): the original indentation of this method was lost; this
        assumes one row per CardRec (not per FCodes) - confirm.
        """
        dw = self.dwfrag
        for frag in pt.xpath('.//FragSub'):
            base = d0.copy()
            base["FragSjct"] = frag.get('sjct')
            for card in frag.xpath('.//CardRec'):
                # Fresh dict per CardRec so Fcodes-* values from a previous
                # card cannot leak into a card that has no FCodes.
                d = base.copy()
                d["CardRec-no"] = card.get('no')
                d["CardRec-trc"] = card.get('trc')
                # With several FCodes in one card the last one wins.
                for fc in card.xpath('.//FCodes'):
                    d["Fcodes-sjct"] = fc.get('sjct')
                    d["Fcodes-trc"] = fc.get('trc')
                    d["Fcodes"] = (fc.text or '').replace(" ", ",")
                d["RINs"] = self._join_text(card.xpath('.//RIN'))
                dw.writerow(d)

    def findPolymer(self, pt, d0):
        """Write one row per EPhrase found via EPolymer/EPara/ESent."""
        dw = self.dwpolymer
        for polymer in pt.xpath('.//EPolymer'):
            for para in polymer.xpath('.//EPara'):
                for sent in para.xpath('.//ESent'):
                    for phrase in sent.xpath('.//EPhrase'):
                        d = d0.copy()
                        d["ESent-no"] = sent.get('no')
                        d["ESent-trc"] = sent.get('trc')
                        d["EPara-no"] = para.get('no')
                        et = phrase.xpath('.//ETs')
                        if et:
                            # Only the first ETs element is used.
                            d["ETs"] = (et[0].text or '').replace(" ", ",")
                        d["SCNDs"] = self._join_text(phrase.xpath('.//SCND'))
                        d["DCRs"] = ','.join(
                            (x.get('ext') or '')
                            for x in phrase.xpath('.//DCR'))
                        dw.writerow(d)

    def findKI(self, pt, d0):
        """Write one row per DCR element found via KPara/KSent."""
        dw = self.dwki
        for para in pt.xpath('.//KPara'):
            for sent in para.xpath('.//KSent'):
                for dcr in sent.xpath('.//DCR'):
                    d = d0.copy()
                    d["KSent-no"] = sent.get('no')
                    d["KSent-flvr"] = sent.get('flvr')
                    d["KPara-no"] = para.get('no')
                    d["DCR-rl"] = dcr.get('rl')
                    d["DCR"] = dcr.text
                    dw.writerow(d)

    def findIdxU(self, pt, d0):
        """Write one row per IdxU element under ``pt``."""
        dw = self.dwidxu
        for idx in pt.xpath('.//IdxU'):
            d = d0.copy()
            d["DRNUs"] = self._join_rl_text(idx.xpath('.//DRNU'))
            d["RINUs"] = self._join_text(idx.xpath('.//RINU'))
            d["DCRUs"] = self._join_rl_text(idx.xpath('.//DCRU'))
            dw.writerow(d)

    def findCitedPt(self, pt, d0):
        """Write one row per CitedPt element under ``pt``."""
        dw = self.dwsrc
        for cited in pt.xpath('.//CitedPt'):
            d = d0.copy()
            d["Cited-pan"] = cited.get('pan')
            d["Cited-ki"] = cited.get('ki')
            d["Cited-se"] = cited.get('se')
            d["Cited-co"] = cited.get('co')
            sources = cited.xpath('.//Src')
            if sources:
                # Only the first Src element is recorded.
                first = sources[0]
                d["Cited-src-cat"] = first.get("cat")
                d["Cited-src"] = first.get("src")
            dw.writerow(d)

    def findCitedJn(self, pt, d0):
        """Write one row per CitedJn element under ``pt``.

        The element text is stored as "Journal Title"; when text is present,
        the "year" and "year_test" columns are filled from
        ``_extract_years`` and left empty when no plausible year is found.
        """
        dw = self.dwjrn
        for cited in pt.xpath('.//CitedJn'):
            d = d0.copy()
            d["jn-cat"] = cited.get('cat')
            d["jn-id"] = cited.get('id')
            d["jn-typ"] = cited.get('typ')
            d["Journal Title"] = cited.text
            if cited.text is not None:
                year, year_test = self._extract_years(cited.text)
                if year is not None:
                    d["year"] = year
                if year_test is not None:
                    d["year_test"] = year_test
            dw.writerow(d)
if __name__ == '__main__':
    # Script entry point: parse every .xml file named on the command line.
    #
    # Each argument is treated as a glob pattern. Most shells expand
    # "*.xml" into several arguments before Python starts, so every
    # argument must be processed, not only the first one; globbing each
    # argument also covers shells that pass the wildcard through unexpanded.
    if len(sys.argv) < 2:
        sys.exit("usage: python parsexml.py FILE.xml [FILE.xml ...]")
    for pattern in sys.argv[1:]:
        for a in glob.glob(pattern):
            suffix = op.splitext(op.basename(a))[1]
            if suffix == '.xml':
                # Single-argument print() behaves the same on Python 2 and 3.
                print("Parsing file:  %s" % a)
                # Constructing the parser performs the whole conversion.
                PatParse(a)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment