Created
March 29, 2013 03:42
-
-
Save mlincoln/5268632 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: latin-1 -*- | |
| import xml.sax | |
| import re | |
| import unicodedata | |
| import codecs | |
class ABContentHandler(xml.sax.ContentHandler):
    """SAX handler that flattens DBLP-style records into tab-separated rows.

    A header row ("Key" followed by the names in ``fields``) is printed when
    the handler is created.  Each time an element listed in ``itemtypes``
    closes, one row is printed: the item's ``key`` attribute followed by one
    column per entry in ``fields``.  A column holds "-" when the item had no
    such element, and repeated elements are joined with "|".

    Per-field post-processing applied when a field element closes:

    * ``author`` -- rewritten as ``last\\first\\firstinit\\middleinit\\suffix``
      (backslash separated, "-" for parts that are absent).
    * ``month``  -- reduced to a lower-case three-letter abbreviation; values
      that are not a known month are dropped.
    * ``ee``     -- an ``issn=XXXX-XXXX`` query value, when present, overwrites
      the ``issn`` column; links starting with "db/" get the DBLP site prefix.
    """

    # Lower-cased tokens treated as a generational suffix when they end a name.
    _SUFFIXES = ("jr.", "sr.", "ii", "iii", "iv", "v")
    # ISSN embedded in an electronic-edition URL.
    _ISSN_IN_URL = re.compile(r"issn=(\w{4}-\w{4})")
    # Prefix for links that the data abbreviates to a site-relative "db/..." path.
    _DBLP_BASE = "http://dblp.uni-trier.de/"

    def __init__(self):
        """Set up the field tables and print the TSV header row."""
        xml.sax.ContentHandler.__init__(self)
        ##second level item types we are interested in (these are all 8)
        self.itemtypes = ("article", "inproceedings", "proceedings", "book",
                          "incollection", "phdthesis", "mastersthesis", "www")
        ##third level elements we are interested in (the details of self.itemtypes)
        self.fields = ("author", "title", "journal", "issn", "year", "month",
                       "volume", "number", "pages", "ee", "publisher")
        self.Months = ("jan", "feb", "mar", "apr", "may", "jun",
                       "jul", "aug", "sep", "oct", "nov", "dec")
        self.articleKey = ''
        self.issn2 = "-"
        ##isElement: 1 while a field's tag is open; Content: finished column text
        self.isElement = {}
        self.Content = {}
        ##text chunks received so far for each currently open field
        self._chunks = {}
        for field in self.fields:
            self.isElement[field] = 0
            # Pre-filling Content keeps a field seen outside any item from
            # raising KeyError.
            self.Content[field] = '-'
            self._chunks[field] = []
        ##print the header
        print("Key\t" + "\t".join(self.fields))

    def startElement(self, name, attrs):
        """Begin a new item (resetting every column) or open a field."""
        if name in self.itemtypes:
            self.articleKey = attrs.get('key', '')
            for field in self.fields:
                self.Content[field] = '-'
                self.isElement[field] = 0
                self._chunks[field] = []
        elif name in self.fields:
            self.isElement[name] = 1
            self._chunks[name] = []

    def characters(self, content):
        """Buffer text for every open field.

        The parser may deliver the text of one element in several calls, so
        chunks are only collected here and joined, without any separator,
        when the element closes.
        """
        for field in self.fields:
            if self.isElement[field] == 1:
                self._chunks[field].append(content)

    def endElement(self, name):
        """Finish a field (store its value) or an item (print its row)."""
        ##an end tag closing an item: print all the contents as one tsv row
        if name in self.itemtypes:
            row = [self.articleKey] + [self.Content[f] for f in self.fields]
            print("\t".join(row))
            return
        if name not in self.fields or self.isElement[name] != 1:
            return
        # Always clear the flag, whatever the value turns out to be, so later
        # text cannot leak into this column.
        self.isElement[name] = 0
        # Collapse whitespace runs: a tab or newline inside a value would
        # break the tab-separated row.
        text = " ".join("".join(self._chunks[name]).split())
        self._chunks[name] = []
        if not text:
            return
        if name == "author":
            text = self._format_author(text)
        elif name == "month":
            text = self._normalise_month(text)
        elif name == "ee":
            text = self._expand_ee(text)
        if text is None:
            return
        ##multiple values of the same tag are concatenated with a pipe separator
        if self.Content[name] == '-':
            self.Content[name] = text
        else:
            self.Content[name] += '|' + text

    def _format_author(self, fullname):
        """Return one author as ``last\\first\\firstinit\\middleinit\\suffix``."""
        given = fullname.split()
        suffix = "-"
        # A lone token is always the last name, even if it looks like a suffix.
        if len(given) > 1 and given[-1].lower() in self._SUFFIXES:
            suffix = given.pop()
        lastname = given.pop()
        firstname = "-"
        firstinit = "-"
        middleinit = "-"
        if given:
            firstname = " ".join(given)
            firstinit = given[0][0]
            if len(given) > 1:
                middleinit = given[1][0]
            ##if the first name is only an initial, get rid of it
            if len(given[0].replace(".", "")) == 1:
                firstname = "-"
        return "\\".join([lastname, firstname, firstinit, middleinit, suffix])

    def _normalise_month(self, text):
        """Return the three-letter month abbreviation, or None if unknown."""
        abbreviation = text[:3].lower()
        if abbreviation in self.Months:
            return abbreviation
        return None

    def _expand_ee(self, url):
        """Pull an ISSN out of *url* and expand site-relative "db/" links."""
        match = self._ISSN_IN_URL.search(url)
        if match:
            self.issn2 = match.group(1)
            self.Content["issn"] = self.issn2
        ##dblp abbreviates urls that point to their own site; prepend the host
        ##only to those, never to a url that is already absolute
        if url.startswith("db/"):
            url = self._DBLP_BASE + url
        return url
def main(sourceFileName):
    """Parse the XML file *sourceFileName* and print its records as TSV.

    The file is opened in binary mode so the XML parser decodes it according
    to the document's own encoding declaration, and the ``with`` block closes
    the handle even if parsing raises.
    """
    with open(sourceFileName, "rb") as source:
        xml.sax.parse(source, ABContentHandler())
if __name__ == "__main__":
    import sys

    # Defaults to the original hard-coded sample file; pass another path
    # (e.g. "dblp.xml") as the first command-line argument to override it.
    main(sys.argv[1] if len(sys.argv) > 1 else "removeSS.xml")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment