alexstorer · November 8, 2012 19:21
diff --git a/parsexml.py b/parsexml.py
 # Use this file by navigating in the terminal to the directory where your file is located and typing:
 # python parsexml.py [name of xml file]
 # You can parse multiple xml files by using the wildcard operator.
 # To process every xml file in a directory, do this:
 # python parsexml.py *.xml

 from lxml import etree
 import re
 import csv
 import os.path as op
 import sys
 import glob

 class PatParse(object):

    def __init__(self,fname):
        """Parse the XML file ``fname`` and export its records as CSV files.

        Nine files named ``<prefix>_<suffix>.csv`` are created in the current
        working directory, ``<prefix>`` being the basename of ``fname``
        without its extension.  Each gets a header row; then every
        ``PatentTyp1`` element found below a ``PanTyp1`` element is passed to
        each ``find*`` method, which appends rows to its own file.  All
        files, the input included, are closed before the constructor
        returns, even when parsing or extraction raises.

        fname -- path of the XML file to parse.
        """
        prefix = op.splitext(op.basename(fname))[0]

        # Identifying columns shared by every output file.
        basenames = ['pan', 'co', 'ki', 'se', 'typ']

        # (writer attribute, file-name suffix, columns that follow basenames).
        # NOTE(review): "dccs" appears twice for the dc file, so that CSV has
        # a duplicated column.  It is kept because removing it would shift
        # column positions for existing consumers -- confirm before dropping.
        outputs = [
            ('dwdc', 'dc',
             ['mcc', 'dccm', 'dccs',
              "cpi-sct", "dccs", "engpi-sct", "dceng", "epi-sct",
              "dce", "mce"]),
            ('dwipc', 'ipc',
             ["IPC-rank", "IPC-text"]),
            ('dwplasdoc', 'plasdoc',
             ["PCode-no", "PCode-MPs", "PCode-Ks"]),
            ('dwfrag', 'frag',
             ["FragSjct", "CardRec-no", "CardRec-trc", "Fcodes-sjct",
              "Fcodes-trc", "Fcodes", "RINs"]),
            ('dwidxu', 'idxu',
             ["DRNUs", "RINUs", "DCRUs"]),
            ('dwpolymer', 'polymer',
             ["ESent-no", "ESent-trc", "EPara-no", "ETs", "SCNDs", "DCRs"]),
            ('dwki', 'ki',
             ["KSent-no", "KSent-flvr", "KPara-no", "DCR-rl", "DCR"]),
            ('dwsrc', 'src',
             ["Cited-pan", "Cited-ki", "Cited-se", "Cited-co",
              "Cited-src-cat", "Cited-src"]),
            ('dwjrn', 'jrn',
             ["jn-cat", "jn-id", "jn-typ", "Journal Title",
              "year", "year_test"]),
        ]

        handles = []
        try:
            for attr, suffix, extra in outputs:
                fp = open(prefix+'_'+suffix+'.csv','w')
                # Track the handle first so it is closed even if writing the
                # header fails.
                handles.append(fp)
                writer = csv.DictWriter(fp, basenames + extra)
                writer.writeheader()
                setattr(self, attr, writer)

            # Bound methods instead of eval() on strings; call order is the
            # same as before.
            finders = [self.findDC, self.findIPC, self.findPlasdoc,
                       self.findFrag, self.findIdxU, self.findPolymer,
                       self.findKI, self.findCitedPt, self.findCitedJn]

            with open(fname,'r') as f:
                tree = etree.parse(f)

            for p in tree.xpath('//PanTyp1'):
                panval = p.get('pan')
                for pt in p.xpath('.//PatentTyp1'):
                    d0 = {
                        "pan": panval,
                        "co": pt.get('co'),
                        "se": pt.get('se'),
                        "ki": pt.get('ki'),
                        "typ": pt.get('typ'),
                    }
                    for finder in finders:
                        finder(pt, d0)
        finally:
            for fp in handles:
                fp.close()
            
    def findDC(self,pt,d0):
        dw = self.dwdc        
        allcpi = pt.xpath('.//CPI')
        for c in allcpi:
            d = d0.copy()
            dccm = c.xpath('.//DCCM')
            dccs = c.xpath('.//DCCS')
            mcc = c.xpath('.//MCC')
            d["cpi-sct"] = c.get('sct')
            d["dccm"] = ','.join(map(lambda x: x.text,dccm))
            d["dccs"] = ','.join(map(lambda x: x.text,dccs))
            d["mcc"] = ','.join(map(lambda x: x.text,mcc))
            dw.writerow(d)
            alleng = pt.xpath('.//EngPI')
            for c in alleng:
                d = d0.copy()
                dceng = c.xpath('.//DCEng')
                d["engpi-sct"] = c.get('sct')
                d["dceng"] = ','.join(map(lambda x: x.text,dceng))
                dw.writerow(d)
                allepi = pt.xpath('.//EPI')
                for c in allepi:
                    d = d0.copy()
                    d["epi-sct"] = c.get('sct')
                    dce = c.xpath('.//DCE')
                    mce = c.xpath('.//MCE')
                    d["dce"] = ','.join(map(lambda x: x.text,dce))
                    d["mce"] = ','.join(map(lambda x: x.text,mce))
                    dw.writerow(d)

        if (allcpi is None) and (allepi is None) and (alleng is None):
            dw.writerow(d0)

    def findIPC(self,pt,d0):
        dw = self.dwipc
        allipc = pt.xpath('.//IPC')
        for c in allipc:
            d = d0.copy()
            d["IPC-rank"] = c.get('rnk')
            d["IPC-text"] = c.text
            dw.writerow(d)

    def findPlasdoc(self,pt,d0):
        dw = self.dwplasdoc
        allp = pt.xpath('.//PCodes')
        for c in allp:
            d = d0.copy()
            d["PCode-no"] = c.get('no')
            mps = c.xpath('.//MPs')
            ks = c.xpath('.//K')

            d["PCode-MPs"] = ','.join(map(lambda x: x.text,mps)).replace(" ",",")
            d["PCode-Ks"] = ','.join(map(lambda x: x.text,ks))
            dw.writerow(d)

    def findFrag(self,pt,d0):
        dw = self.dwfrag
        allf = pt.xpath('.//FragSub')
        for f in allf:
            d = d0.copy()
            d["FragSjct"] = f.get('sjct')
            allCardRec = f.xpath('.//CardRec')
            for c in allCardRec:
                d["CardRec-no"] = c.get('no')
                d["CardRec-trc"] = c.get('trc')
                allFcodes = c.xpath('.//FCodes')
                for fc in allFcodes:
                    d["Fcodes-sjct"] = fc.get('sjct')
                    d["Fcodes-trc"] = fc.get('trc')
                    d["Fcodes"] = fc.text.replace(" ",",")
                    allRINs = c.xpath('.//RIN')                
                    d["RINs"] = ','.join(map(lambda x: x.text,allRINs))
                    dw.writerow(d)

    def findPolymer(self,pt,d0):
        dw = self.dwpolymer
        allp = pt.xpath('.//EPolymer')
        for p in allp:
            allpara = p.xpath('.//EPara')
            for para in allpara:
                allsent = para.xpath('.//ESent')
                for sent in allsent:
                    allphrase = sent.xpath('.//EPhrase')
                    for phrase in allphrase:
                        d = d0.copy()
                        d["ESent-no"] = sent.get('no')
                        d["ESent-trc"] = sent.get('trc')
                        d["EPara-no"] = para.get('no')
                        et = phrase.xpath('.//ETs')
                        allscnd = phrase.xpath('.//SCND')
                        alldcr = phrase.xpath('.//DCR')
                        if et:
                            d["ETs"] = et[0].text.replace(" ",",")
                            d["SCNDs"] = ','.join(map(lambda x: x.text,allscnd))
                            d["DCRs"] = ','.join(map(lambda x: x.get('ext'),alldcr))
                            dw.writerow(d)


    def findKI(self,pt,d0):
        dw = self.dwki
        allpara = pt.xpath('.//KPara')
        for para in allpara:
            allsent = para.xpath('.//KSent')
            for sent in allsent:
                alldcr = sent.xpath('.//DCR')
                for dcr in alldcr:
                    d = d0.copy()
                    d["KSent-no"] = sent.get('no')
                    d["KSent-flvr"] = sent.get('flvr')
                    d["KPara-no"] = para.get('no')
                    d["DCR-rl"] = dcr.get('rl')
                    d["DCR"] = dcr.text
                    dw.writerow(d)


    def findIdxU(self,pt,d0):
        dw = self.dwidxu
        allf = pt.xpath('.//IdxU')
        for f in allf:
            d = d0.copy()
            drnus = f.xpath('.//DRNU')
            rinus = f.xpath('.//RINU')
            dcrus = f.xpath('.//DCRU')
            d["DRNUs"] = ','.join(map(lambda x: x.get('rl')+'-'+x.text,drnus))
            d["RINUs"] = ','.join(map(lambda x: x.text,rinus))
            d["DCRUs"] = ','.join(map(lambda x: x.get('rl')+'-'+x.text,dcrus))
            dw.writerow(d)

    def findCitedPt(self,pt,d0):
        dw = self.dwsrc
        allf = pt.xpath('.//CitedPt')
        for f in allf:
            d = d0.copy()
            d["Cited-pan"] = f.get('pan')
            d["Cited-ki"] = f.get('ki')
            d["Cited-se"] = f.get('se')
            d["Cited-co"] = f.get('co')
            citeSrc = f.xpath('.//Src')
            if len(citeSrc) > 0:
                cs = citeSrc[0]
                d["Cited-src-cat"] = cs.get("cat")
                d["Cited-src"] = cs.get("src")
                dw.writerow(d)

    def findCitedJn(self,pt,d0):
        dw = self.dwjrn
        allf = pt.xpath('.//CitedJn')
        for f in allf:
            d = d0.copy()
            d["jn-cat"] = f.get('cat')
            d["jn-id"] = f.get('id')
            d["jn-typ"] = f.get('typ')
            d["Journal Title"] = f.text
            if f.text is not None:
                year_str = re.sub('\D\d{4}\s*-\s*\d{2}\d+','',f.text)
                year_results_f = re.findall('\D(\d{4})\D','random'+year_str)
                y = filter(lambda x: int(x) > 1900 and int(x) < 2013,year_results_f)
                if len(y)>0:
                    d["year"] = max(set(y),key=y.count)
                    year_results_f.extend(re.findall('\D(\d{4})$','random'+year_str))
                    y = filter(lambda x: int(x) > 1900 and int(x) < 2013,year_results_f)
                    if len(y)>0:
                        d["year_test"] = max(set(y),key=y.count)


            dw.writerow(d)

 # Treat every command-line argument as a glob pattern.  The shell usually
 # expands "*.xml" into several arguments before Python sees them, so looking
 # at sys.argv[1] alone would process only the first file.  With no arguments
 # the loop simply does nothing.
 for pattern in sys.argv[1:]:
    for a in glob.glob(pattern):
        suffix = op.splitext(op.basename(a))[1]
        if suffix=='.xml':
            # Single-argument print call: identical output on Python 2 and 3.
            print("Parsing file:  " + a)
            p = PatParse(a)
	# Use this file by navigating in the terminal to the directory where your file is located and typing:
	# python parsexml.py [name of xml file]
	# You can parse multiple xml files by using the wildcard operator.
	# To process every xml file in a directory, do this:
	# python parsexml.py *.xml

	from lxml import etree
	import re
	import csv
	import os.path as op
	import sys
	import glob

	class PatParse(object):
		"""Export the records of one patent XML file to a set of CSV files.

		Constructing an instance does all the work: the XML file is parsed and
		nine ``<prefix>_<suffix>.csv`` files are written to the current working
		directory.  Each ``find*`` method receives a ``PatentTyp1`` element
		(``pt``) and a dict of shared column values (``d0``) and appends rows
		to its own CSV writer.
		"""

		def __init__(self,fname):
			"""Parse ``fname`` and write every CSV file.

			``<prefix>`` is the basename of ``fname`` without its extension.
			All files, the input included, are closed before returning, even
			when parsing or extraction raises.
			"""
			prefix = op.splitext(op.basename(fname))[0]

			# Identifying columns shared by every output file.
			basenames = ['pan', 'co', 'ki', 'se', 'typ']

			# (writer attribute, file-name suffix, columns after basenames).
			# NOTE(review): "dccs" appears twice for the dc file, so that CSV
			# has a duplicated column.  Kept because removing it would shift
			# column positions for existing consumers -- confirm first.
			outputs = [
				('dwdc', 'dc',
					['mcc', 'dccm', 'dccs',
					"cpi-sct","dccs","engpi-sct","dceng","epi-sct",
					"dce","mce"]),
				('dwipc', 'ipc',
					["IPC-rank","IPC-text"]),
				('dwplasdoc', 'plasdoc',
					["PCode-no","PCode-MPs","PCode-Ks"]),
				('dwfrag', 'frag',
					["FragSjct","CardRec-no","CardRec-trc","Fcodes-sjct",
					"Fcodes-trc","Fcodes","RINs"]),
				('dwidxu', 'idxu',
					["DRNUs","RINUs","DCRUs"]),
				('dwpolymer', 'polymer',
					["ESent-no","ESent-trc","EPara-no","ETs","SCNDs","DCRs"]),
				('dwki', 'ki',
					["KSent-no","KSent-flvr","KPara-no","DCR-rl","DCR"]),
				('dwsrc', 'src',
					["Cited-pan","Cited-ki","Cited-se","Cited-co",
					"Cited-src-cat","Cited-src"]),
				('dwjrn', 'jrn',
					["jn-cat","jn-id","jn-typ","Journal Title",
					"year","year_test"]),
			]

			handles = []
			try:
				for attr, suffix, extra in outputs:
					fp = open(prefix+'_'+suffix+'.csv','w')
					# Track the handle first so it is closed even if writing
					# the header fails.
					handles.append(fp)
					writer = csv.DictWriter(fp, basenames + extra)
					writer.writeheader()
					setattr(self, attr, writer)

				# Bound methods instead of eval() on strings.
				finders = [self.findDC, self.findIPC, self.findPlasdoc,
					self.findFrag, self.findIdxU, self.findPolymer,
					self.findKI, self.findCitedPt, self.findCitedJn]

				with open(fname,'r') as f:
					tree = etree.parse(f)

				for p in tree.xpath('//PanTyp1'):
					panval = p.get('pan')
					for pt in p.xpath('.//PatentTyp1'):
						d0 = {
							"pan": panval,
							"co": pt.get('co'),
							"se": pt.get('se'),
							"ki": pt.get('ki'),
							"typ": pt.get('typ'),
						}
						for finder in finders:
							finder(pt, d0)
			finally:
				for fp in handles:
					fp.close()

		def findDC(self,pt,d0):
			"""Write the ``CPI``, ``EngPI`` and ``EPI`` rows of ``pt`` to the dc CSV.

			One row per ``CPI``, then per ``EngPI``, then per ``EPI`` element;
			each is a copy of ``d0`` plus the ``sct`` attribute and the
			comma-joined text of the element's code children.  When ``pt`` has
			none of the three, ``d0`` itself is written.
			"""
			def joined(nodes):
				# A node without text contributes an empty string.
				return ','.join([node.text or '' for node in nodes])

			dw = self.dwdc
			allcpi = pt.xpath('.//CPI')
			alleng = pt.xpath('.//EngPI')
			allepi = pt.xpath('.//EPI')

			for cpi in allcpi:
				d = d0.copy()
				d["cpi-sct"] = cpi.get('sct')
				d["dccm"] = joined(cpi.xpath('.//DCCM'))
				d["dccs"] = joined(cpi.xpath('.//DCCS'))
				d["mcc"] = joined(cpi.xpath('.//MCC'))
				dw.writerow(d)

			for eng in alleng:
				d = d0.copy()
				d["engpi-sct"] = eng.get('sct')
				d["dceng"] = joined(eng.xpath('.//DCEng'))
				dw.writerow(d)

			for epi in allepi:
				d = d0.copy()
				d["epi-sct"] = epi.get('sct')
				d["dce"] = joined(epi.xpath('.//DCE'))
				d["mce"] = joined(epi.xpath('.//MCE'))
				dw.writerow(d)

			# xpath() yields lists, never None, so test for emptiness.
			if not (allcpi or alleng or allepi):
				dw.writerow(d0)

		def findIPC(self,pt,d0):
			"""Write one ipc row (``rnk`` attribute and text) per ``IPC`` element."""
			dw = self.dwipc
			for c in pt.xpath('.//IPC'):
				d = d0.copy()
				d["IPC-rank"] = c.get('rnk')
				d["IPC-text"] = c.text
				dw.writerow(d)

		def findPlasdoc(self,pt,d0):
			"""Write one plasdoc row per ``PCodes`` element.

			Columns: the ``no`` attribute, the comma-joined ``MPs`` texts
			(inner spaces also become commas) and the comma-joined ``K`` texts.
			"""
			dw = self.dwplasdoc
			for c in pt.xpath('.//PCodes'):
				d = d0.copy()
				d["PCode-no"] = c.get('no')
				mps = c.xpath('.//MPs')
				ks = c.xpath('.//K')
				d["PCode-MPs"] = ','.join([x.text for x in mps]).replace(" ",",")
				d["PCode-Ks"] = ','.join([x.text for x in ks])
				dw.writerow(d)

		def findFrag(self,pt,d0):
			"""Write one frag row per ``FCodes`` under ``FragSub`` -> ``CardRec``.

			One row dict per ``FragSub`` is updated in place on the way down.
			"""
			dw = self.dwfrag
			for f in pt.xpath('.//FragSub'):
				d = d0.copy()
				d["FragSjct"] = f.get('sjct')
				for c in f.xpath('.//CardRec'):
					d["CardRec-no"] = c.get('no')
					d["CardRec-trc"] = c.get('trc')
					for fc in c.xpath('.//FCodes'):
						d["Fcodes-sjct"] = fc.get('sjct')
						d["Fcodes-trc"] = fc.get('trc')
						d["Fcodes"] = fc.text.replace(" ",",")
						allRINs = c.xpath('.//RIN')
						d["RINs"] = ','.join([x.text for x in allRINs])
						dw.writerow(d)

		def findPolymer(self,pt,d0):
			"""Write one polymer row per ``EPhrase`` that contains ``ETs``.

			Walks ``EPolymer`` -> ``EPara`` -> ``ESent`` -> ``EPhrase``;
			phrases without ``ETs`` are skipped.
			"""
			dw = self.dwpolymer
			for p in pt.xpath('.//EPolymer'):
				for para in p.xpath('.//EPara'):
					for sent in para.xpath('.//ESent'):
						for phrase in sent.xpath('.//EPhrase'):
							d = d0.copy()
							d["ESent-no"] = sent.get('no')
							d["ESent-trc"] = sent.get('trc')
							d["EPara-no"] = para.get('no')
							et = phrase.xpath('.//ETs')
							allscnd = phrase.xpath('.//SCND')
							alldcr = phrase.xpath('.//DCR')
							if et:
								d["ETs"] = et[0].text.replace(" ",",")
								d["SCNDs"] = ','.join([x.text for x in allscnd])
								d["DCRs"] = ','.join([x.get('ext') for x in alldcr])
								dw.writerow(d)

		def findKI(self,pt,d0):
			"""Write one ki row per ``DCR`` under ``KPara`` -> ``KSent``."""
			dw = self.dwki
			for para in pt.xpath('.//KPara'):
				for sent in para.xpath('.//KSent'):
					for dcr in sent.xpath('.//DCR'):
						d = d0.copy()
						d["KSent-no"] = sent.get('no')
						d["KSent-flvr"] = sent.get('flvr')
						d["KPara-no"] = para.get('no')
						d["DCR-rl"] = dcr.get('rl')
						d["DCR"] = dcr.text
						dw.writerow(d)

		def findIdxU(self,pt,d0):
			"""Write one idxu row per ``IdxU`` element.

			``DRNU``/``DCRU`` descendants are rendered as ``<rl>-<text>``,
			``RINU`` descendants as their text; each list is comma-joined.
			"""
			dw = self.dwidxu
			for f in pt.xpath('.//IdxU'):
				d = d0.copy()
				drnus = f.xpath('.//DRNU')
				rinus = f.xpath('.//RINU')
				dcrus = f.xpath('.//DCRU')
				d["DRNUs"] = ','.join([x.get('rl')+'-'+x.text for x in drnus])
				d["RINUs"] = ','.join([x.text for x in rinus])
				d["DCRUs"] = ','.join([x.get('rl')+'-'+x.text for x in dcrus])
				dw.writerow(d)

		def findCitedPt(self,pt,d0):
			"""Write one src row per ``CitedPt`` that has a ``Src`` descendant.

			``CitedPt`` elements without ``Src`` produce no row.
			"""
			dw = self.dwsrc
			for f in pt.xpath('.//CitedPt'):
				d = d0.copy()
				d["Cited-pan"] = f.get('pan')
				d["Cited-ki"] = f.get('ki')
				d["Cited-se"] = f.get('se')
				d["Cited-co"] = f.get('co')
				citeSrc = f.xpath('.//Src')
				if len(citeSrc) > 0:
					cs = citeSrc[0]
					d["Cited-src-cat"] = cs.get("cat")
					d["Cited-src"] = cs.get("src")
					dw.writerow(d)

		def findCitedJn(self,pt,d0):
			"""Write one jrn row per ``CitedJn`` element, with year guesses.

			"year" is the most frequent four-digit number strictly between
			1900 and 2013 that has a non-digit on both sides; "year_test" also
			counts a four-digit number ending the text and is computed whether
			or not "year" was found.  Ties go to the year appearing first.
			"""
			def plausible(candidates):
				# A list so the truth test and .count() work on Python 2 and 3.
				return [c for c in candidates if 1900 < int(c) < 2013]

			def most_common(years):
				# max() returns the first maximal item: deterministic ties.
				return max(years, key=years.count)

			dw = self.dwjrn
			for f in pt.xpath('.//CitedJn'):
				d = d0.copy()
				d["jn-cat"] = f.get('cat')
				d["jn-id"] = f.get('id')
				d["jn-typ"] = f.get('typ')
				d["Journal Title"] = f.text
				if f.text is not None:
					# Remove "NNNN-NNN..." runs (presumably page ranges); the
					# whitespace around the dash is optional.
					year_str = re.sub(r'\D\d{4}\s*-\s*\d{2}\d+','',f.text)
					# 'random' guarantees a non-digit before a leading number.
					padded = 'random'+year_str
					year_results_f = re.findall(r'\D(\d{4})\D',padded)
					y = plausible(year_results_f)
					if y:
						d["year"] = most_common(y)
					year_results_f.extend(re.findall(r'\D(\d{4})$',padded))
					y = plausible(year_results_f)
					if y:
						d["year_test"] = most_common(y)

				dw.writerow(d)

	# Treat every command-line argument as a glob pattern.  The shell usually
	# expands "*.xml" into several arguments before Python sees them, so looking
	# at sys.argv[1] alone would process only the first file.  With no arguments
	# the loop simply does nothing.
	for pattern in sys.argv[1:]:
		for a in glob.glob(pattern):
			suffix = op.splitext(op.basename(a))[1]
			if suffix=='.xml':
				# Single-argument print call: identical output on Python 2 and 3.
				print("Parsing file:  " + a)
				p = PatParse(a)