Created
November 8, 2012 19:21
-
-
Save alexstorer/4040920 to your computer and use it in GitHub Desktop.
Patent Processing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Usage: in a terminal, change to the directory that contains your XML file and run:
#     python parsexml.py [name of xml file]
# You can parse multiple XML files by using the wildcard operator.
# To process every XML file in a directory, run:
#     python parsexml.py *.xml
| from lxml import etree | |
| import re | |
| import csv | |
| import os.path as op | |
| import sys | |
| import glob | |
class PatParse(object):
    """Split one patent XML file into a family of per-topic CSV files.

    Constructing an instance does all of the work: nine CSV files named
    ``<prefix>_<topic>.csv`` are created in the current working directory
    (``<prefix>`` is the input file name without directory or extension),
    the XML file is parsed with lxml, and every ``PatentTyp1`` element found
    below a ``PanTyp1`` element is handed to each ``find*`` method, which
    appends zero or more rows to its own CSV file.  All CSV files are closed
    again before the constructor returns, even if parsing fails.

    Every row starts with the identifying columns in ``BASE_NAMES``.
    """

    # Identifying columns repeated at the start of every output row.
    BASE_NAMES = ['pan', 'co', 'ki', 'se', 'typ']

    # A four-digit number is only accepted as a citation year when it lies
    # strictly between these two bounds.
    YEAR_MIN = 1900
    YEAR_MAX = 2013

    # Four digits, a dash, then three or more digits (e.g. " 1234-5678").
    # Such spans are removed before looking for years.
    _DIGIT_RANGE = re.compile(r'\D\d{4}\s*-\s*\d{2}\d+')
    # Exactly four digits with a non-digit character on both sides.
    _YEAR_INNER = re.compile(r'\D(\d{4})\D')
    # Exactly four digits at the very end of the string.
    _YEAR_AT_END = re.compile(r'\D(\d{4})$')

    def __init__(self, fname):
        """Parse ``fname`` and write all CSV outputs.

        Args:
            fname: path of the XML file to read.

        Raises:
            IOError/OSError if an output file cannot be created, and
            whatever ``lxml.etree.parse`` raises for unreadable or malformed
            XML.
        """
        prefix = op.splitext(op.basename(fname))[0]
        base = self.BASE_NAMES
        handles = []

        def make_writer(topic, extra_columns):
            # Open <prefix>_<topic>.csv, remember the handle so it can be
            # closed later, and emit the header row.
            fobj = self._openCsv(prefix + '_' + topic + '.csv')
            handles.append(fobj)
            writer = csv.DictWriter(fobj, base + extra_columns)
            writer.writeheader()
            return writer

        try:
            # NOTE: the original column list named "dccs" twice, which
            # produced a duplicated CSV column; it is listed once here.
            self.dwdc = make_writer('dc', [
                'mcc', 'dccm', 'dccs', "cpi-sct", "engpi-sct", "dceng",
                "epi-sct", "dce", "mce"])
            self.dwipc = make_writer('ipc', ["IPC-rank", "IPC-text"])
            self.dwplasdoc = make_writer('plasdoc', [
                "PCode-no", "PCode-MPs", "PCode-Ks"])
            self.dwfrag = make_writer('frag', [
                "FragSjct", "CardRec-no", "CardRec-trc", "Fcodes-sjct",
                "Fcodes-trc", "Fcodes", "RINs"])
            self.dwidxu = make_writer('idxu', ["DRNUs", "RINUs", "DCRUs"])
            self.dwpolymer = make_writer('polymer', [
                "ESent-no", "ESent-trc", "EPara-no", "ETs", "SCNDs", "DCRs"])
            self.dwki = make_writer('ki', [
                "KSent-no", "KSent-flvr", "KPara-no", "DCR-rl", "DCR"])
            self.dwsrc = make_writer('src', [
                "Cited-pan", "Cited-ki", "Cited-se", "Cited-co",
                "Cited-src-cat", "Cited-src"])
            self.dwjrn = make_writer('jrn', [
                "jn-cat", "jn-id", "jn-typ", "Journal Title",
                "year", "year_test"])

            # Bound methods instead of eval() on method-name strings.
            extractors = [
                self.findDC, self.findIPC, self.findPlasdoc,
                self.findFrag, self.findIdxU, self.findPolymer,
                self.findKI, self.findCitedPt, self.findCitedJn,
            ]

            # Passing the path lets lxml open, decode and close the input
            # itself, so no input handle is left open.
            tree = etree.parse(fname)
            for pan in tree.xpath('//PanTyp1'):
                panval = pan.get('pan')
                for pt in pan.xpath('.//PatentTyp1'):
                    d0 = {
                        "pan": panval,
                        "co": pt.get('co'),
                        "se": pt.get('se'),
                        "ki": pt.get('ki'),
                        "typ": pt.get('typ'),
                    }
                    for extract in extractors:
                        extract(pt, d0)
        finally:
            # Close every CSV that was opened, even when parsing failed.
            for fobj in handles:
                fobj.close()

    @staticmethod
    def _openCsv(path):
        """Open ``path`` for writing in the mode the csv module expects."""
        if sys.version_info[0] >= 3:
            return open(path, 'w', newline='')
        return open(path, 'wb')

    @staticmethod
    def _text(node):
        """Return ``node.text``, or '' when the element has no text."""
        return node.text or ''

    @classmethod
    def _joinText(cls, nodes):
        """Comma-join the text of ``nodes``; empty elements contribute ''."""
        return ','.join(cls._text(n) for n in nodes)

    @classmethod
    def _joinRoleText(cls, nodes):
        """Comma-join ``<rl attribute>-<text>`` for each node."""
        return ','.join((n.get('rl') or '') + '-' + cls._text(n)
                        for n in nodes)

    @classmethod
    def _commonYear(cls, candidates):
        """Return the most frequent in-range year string, or None.

        ``candidates`` is a list of four-digit strings.  Ties go to the year
        that appears first, which keeps the result deterministic.
        """
        years = [c for c in candidates
                 if cls.YEAR_MIN < int(c) < cls.YEAR_MAX]
        if not years:
            return None
        return max(years, key=years.count)

    def findDC(self, pt, d0):
        """Write one row per CPI, EngPI and EPI element below ``pt``.

        Args:
            pt: element to search (must offer ``xpath``).
            d0: dict of identifying columns copied into every row.

        When none of the three element kinds is present, a single row that
        holds only the identifying columns is written.
        """
        dw = self.dwdc
        join = self._joinText

        allcpi = pt.xpath('.//CPI')
        for c in allcpi:
            d = d0.copy()
            d["cpi-sct"] = c.get('sct')
            d["dccm"] = join(c.xpath('.//DCCM'))
            d["dccs"] = join(c.xpath('.//DCCS'))
            d["mcc"] = join(c.xpath('.//MCC'))
            dw.writerow(d)

        alleng = pt.xpath('.//EngPI')
        for c in alleng:
            d = d0.copy()
            d["engpi-sct"] = c.get('sct')
            d["dceng"] = join(c.xpath('.//DCEng'))
            dw.writerow(d)

        allepi = pt.xpath('.//EPI')
        for c in allepi:
            d = d0.copy()
            d["epi-sct"] = c.get('sct')
            d["dce"] = join(c.xpath('.//DCE'))
            d["mce"] = join(c.xpath('.//MCE'))
            dw.writerow(d)

        # xpath() returns (possibly empty) lists, never None, so the
        # fallback has to test for emptiness to ever fire.
        if not (allcpi or alleng or allepi):
            dw.writerow(d0)

    def findIPC(self, pt, d0):
        """Write one row per IPC element below ``pt`` (rank and text)."""
        dw = self.dwipc
        for c in pt.xpath('.//IPC'):
            d = d0.copy()
            d["IPC-rank"] = c.get('rnk')
            d["IPC-text"] = c.text
            dw.writerow(d)

    def findPlasdoc(self, pt, d0):
        """Write one row per PCodes element below ``pt``."""
        dw = self.dwplasdoc
        for c in pt.xpath('.//PCodes'):
            d = d0.copy()
            d["PCode-no"] = c.get('no')
            # Spaces inside the MPs text become commas as well.
            d["PCode-MPs"] = self._joinText(
                c.xpath('.//MPs')).replace(" ", ",")
            d["PCode-Ks"] = self._joinText(c.xpath('.//K'))
            dw.writerow(d)

    def findFrag(self, pt, d0):
        """Write one row per CardRec element inside each FragSub.

        A FragSub without any CardRec produces no row.
        """
        dw = self.dwfrag
        for frag in pt.xpath('.//FragSub'):
            sjct = frag.get('sjct')
            for c in frag.xpath('.//CardRec'):
                # Fresh dict per CardRec so that FCodes values from an
                # earlier CardRec cannot leak into one that has no FCodes.
                d = d0.copy()
                d["FragSjct"] = sjct
                d["CardRec-no"] = c.get('no')
                d["CardRec-trc"] = c.get('trc')
                # NOTE(review): with several FCodes children only the last
                # one ends up in the row (same as before) - confirm that is
                # the intended output.
                for fc in c.xpath('.//FCodes'):
                    d["Fcodes-sjct"] = fc.get('sjct')
                    d["Fcodes-trc"] = fc.get('trc')
                    d["Fcodes"] = self._text(fc).replace(" ", ",")
                d["RINs"] = self._joinText(c.xpath('.//RIN'))
                dw.writerow(d)

    def findPolymer(self, pt, d0):
        """Write one row per EPhrase found via EPolymer/EPara/ESent."""
        dw = self.dwpolymer
        for polymer in pt.xpath('.//EPolymer'):
            for para in polymer.xpath('.//EPara'):
                for sent in para.xpath('.//ESent'):
                    for phrase in sent.xpath('.//EPhrase'):
                        d = d0.copy()
                        d["ESent-no"] = sent.get('no')
                        d["ESent-trc"] = sent.get('trc')
                        d["EPara-no"] = para.get('no')
                        et = phrase.xpath('.//ETs')
                        if et:
                            # Only the first ETs element is used.
                            d["ETs"] = self._text(et[0]).replace(" ", ",")
                        d["SCNDs"] = self._joinText(phrase.xpath('.//SCND'))
                        d["DCRs"] = ','.join(
                            x.get('ext') or ''
                            for x in phrase.xpath('.//DCR'))
                        dw.writerow(d)

    def findKI(self, pt, d0):
        """Write one row per DCR element found via KPara/KSent."""
        dw = self.dwki
        for para in pt.xpath('.//KPara'):
            for sent in para.xpath('.//KSent'):
                for dcr in sent.xpath('.//DCR'):
                    d = d0.copy()
                    d["KSent-no"] = sent.get('no')
                    d["KSent-flvr"] = sent.get('flvr')
                    d["KPara-no"] = para.get('no')
                    d["DCR-rl"] = dcr.get('rl')
                    d["DCR"] = dcr.text
                    dw.writerow(d)

    def findIdxU(self, pt, d0):
        """Write one row per IdxU element below ``pt``."""
        dw = self.dwidxu
        for idx in pt.xpath('.//IdxU'):
            d = d0.copy()
            d["DRNUs"] = self._joinRoleText(idx.xpath('.//DRNU'))
            d["RINUs"] = self._joinText(idx.xpath('.//RINU'))
            d["DCRUs"] = self._joinRoleText(idx.xpath('.//DCRU'))
            dw.writerow(d)

    def findCitedPt(self, pt, d0):
        """Write one row per CitedPt element below ``pt``."""
        dw = self.dwsrc
        for cited in pt.xpath('.//CitedPt'):
            d = d0.copy()
            d["Cited-pan"] = cited.get('pan')
            d["Cited-ki"] = cited.get('ki')
            d["Cited-se"] = cited.get('se')
            d["Cited-co"] = cited.get('co')
            sources = cited.xpath('.//Src')
            if sources:
                # Only the first Src element is reported.
                d["Cited-src-cat"] = sources[0].get("cat")
                d["Cited-src"] = sources[0].get("src")
            dw.writerow(d)

    def findCitedJn(self, pt, d0):
        """Write one row per CitedJn element, with a guessed year.

        ``year`` is the most frequent four-digit number (within the class
        year bounds) that is followed by a non-digit character;
        ``year_test`` additionally counts a four-digit number at the very
        end of the text.  Either column is left empty when nothing matches.
        """
        dw = self.dwjrn
        for jn in pt.xpath('.//CitedJn'):
            d = d0.copy()
            d["jn-cat"] = jn.get('cat')
            d["jn-id"] = jn.get('id')
            d["jn-typ"] = jn.get('typ')
            d["Journal Title"] = jn.text
            if jn.text is not None:
                # The non-digit prefix lets a year at the very start of the
                # text satisfy the leading \D of the patterns.
                padded = 'random' + self._DIGIT_RANGE.sub('', jn.text)
                candidates = self._YEAR_INNER.findall(padded)
                year = self._commonYear(candidates)
                if year is not None:
                    d["year"] = year
                candidates.extend(self._YEAR_AT_END.findall(padded))
                year = self._commonYear(candidates)
                if year is not None:
                    d["year_test"] = year
            dw.writerow(d)
# Command-line driver: every argument is treated as a file name or glob
# pattern, and each matching file whose extension is ".xml" is converted.
#
# All arguments are used rather than only the first, because a shell expands
# an unquoted "*.xml" into one argument per file before Python ever sees it;
# reading only sys.argv[1] would then process just the first file.
patterns = sys.argv[1:]
if not patterns:
    # Explain the problem instead of failing with a bare IndexError.
    sys.stderr.write("usage: python parsexml.py FILE_OR_PATTERN [...]\n")
for pattern in patterns:
    for a in glob.glob(pattern):
        suffix = op.splitext(op.basename(a))[1]
        if suffix == '.xml':
            # Single pre-formatted argument so the call prints the same text
            # under Python 2 and Python 3.
            print("%s %s" % ("Parsing file: ", a))
            p = PatParse(a)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment