Skip to content

Instantly share code, notes, and snippets.

@mlincoln
Created March 29, 2013 03:42
Show Gist options
  • Select an option

  • Save mlincoln/5268632 to your computer and use it in GitHub Desktop.

Select an option

Save mlincoln/5268632 to your computer and use it in GitHub Desktop.
# -*- coding: latin-1 -*-
import xml.sax
import re
import unicodedata
import codecs
class ABContentHandler(xml.sax.ContentHandler):
    """SAX handler that flattens DBLP-style records into tab-separated rows.

    A header row is printed when the handler is created.  Afterwards one row
    is printed each time an element listed in ``itemtypes`` is closed: the
    element's ``key`` attribute followed by one column per entry of
    ``fields``.  A column holds ``-`` when the field is absent, and the
    values of a repeated field are joined with ``|``.

    Special handling per field:

    * ``author`` -- each name is stored as
      ``last\\first\\first-initial\\middle-initial\\suffix`` (backslash
      separated, ``-`` for a missing part).
    * ``month`` -- reduced to a lower-case three-letter abbreviation;
      anything not in ``Months`` is dropped.
    * ``ee`` -- site-relative ``db/...`` links are expanded to full URLs and
      an ``issn=XXXX-XXXX`` query value, if present, is copied into the
      ``issn`` column.
    """

    # Three-letter month abbreviations accepted for the "month" column.
    Months = ("jan", "feb", "mar", "apr", "may", "jun",
              "jul", "aug", "sep", "oct", "nov", "dec")
    # Trailing name tokens (compared lower-cased) treated as a name suffix.
    _SUFFIXES = ("jr.", "sr.", "ii", "iii", "iv", "v")
    # Extracts an ISSN from an "ee" value such as "...?issn=1234-5678".
    _ISSN_RE = re.compile(r"issn=(\w{4}-\w{4})")
    # Prefix that turns dblp's site-relative "db/..." links into full URLs.
    _DBLP_BASE = "http://dblp.uni-trier.de/"
    # Placeholder for an absent value and separator between repeated values.
    _MISSING = "-"
    _SEPARATOR = "|"

    def __init__(self):
        xml.sax.ContentHandler.__init__(self)
        # Second-level tags that each produce one output row.
        self.itemtypes = ("article", "inproceedings", "proceedings", "book",
                          "incollection", "phdthesis", "mastersthesis", "www")
        # Third-level tags that become columns, in output order.
        self.fields = ("author", "title", "journal", "issn", "year", "month",
                       "volume", "number", "pages", "ee", "publisher")
        # 1 while the corresponding field element is open, otherwise 0.
        self.isElement = dict.fromkeys(self.fields, 0)
        # Column values of the record being read.  Pre-filled so that a field
        # element met before the first record cannot raise KeyError.
        self.Content = dict.fromkeys(self.fields, self._MISSING)
        # Most recent ISSN found inside an "ee" value.
        self.issn2 = self._MISSING
        # "key" attribute of the record being read.
        self.articleKey = ""
        # Text pieces collected so far for each currently open field element.
        self._open = {}
        self._emit("Key\t" + "\t".join(self.fields))

    def startElement(self, name, attrs):
        """Begin a new record or start collecting text for a field."""
        if name in self.itemtypes:
            self.articleKey = attrs.get('key', '')
            for field in self.fields:
                self.Content[field] = self._MISSING
        elif name in self.fields:
            self.isElement[name] = 1
            self._open[name] = []

    def characters(self, content):
        """Collect character data for every field element that is open.

        The parser may deliver one text node in several chunks (typically
        around entity references), so chunks are buffered and only joined,
        without any separator, once the element is closed.
        """
        for pieces in self._open.values():
            pieces.append(content)

    def endElement(self, name):
        """Store a finished field value, or print the row of a finished record."""
        if name in self._open:
            self.isElement[name] = 0
            raw = "".join(self._open.pop(name))
            # Collapse whitespace runs so that embedded tabs or newlines
            # cannot break the one-row-per-record TSV layout.
            text = " ".join(raw.split())
            if name == "author":
                value = self._format_author(text)
            elif name == "month":
                value = self._normalize_month(text)
            elif name == "ee":
                value = self._expand_ee(text)
            else:
                value = text
            # Empty or rejected values leave the column untouched.
            if value:
                self._append(name, value)
        elif name in self.itemtypes:
            columns = [self.Content[field] for field in self.fields]
            self._emit("\t".join([self.articleKey] + columns))

    def _append(self, name, value):
        """Add *value* to column *name*, pipe-separating repeated values."""
        if self.Content[name] == self._MISSING:
            self.Content[name] = value
        else:
            self.Content[name] += self._SEPARATOR + value

    @classmethod
    def _format_author(cls, fullname):
        """Return ``last\\first\\first-init\\middle-init\\suffix`` for one name.

        Returns ``None`` for an empty name.  A single-token name is treated
        as a last name only.  The first name is reported as ``-`` when its
        first token is a bare initial (one character once dots are removed).
        """
        pieces = fullname.split()
        if not pieces:
            return None
        suffix = cls._MISSING
        # A lone token is always the last name, even if it looks like a suffix.
        if len(pieces) > 1 and pieces[-1].lower() in cls._SUFFIXES:
            suffix = pieces.pop()
        lastname = pieces.pop()
        firstname = firstinit = middleinit = cls._MISSING
        # Whatever remains in front of the last name is the given name(s).
        if pieces:
            firstinit = pieces[0][0]
            if len(pieces) > 1:
                middleinit = pieces[1][0]
            if len(pieces[0].replace(".", "")) != 1:
                firstname = " ".join(pieces)
        return "\\".join((lastname, firstname, firstinit, middleinit, suffix))

    @classmethod
    def _normalize_month(cls, text):
        """Return the lower-case three-letter month, or ``None`` if unknown."""
        abbreviation = text[0:3].lower()
        return abbreviation if abbreviation in cls.Months else None

    def _expand_ee(self, text):
        """Return one ``ee`` value as a full URL, harvesting its ISSN if any."""
        match = self._ISSN_RE.search(text)
        if match:
            self.issn2 = match.group(1)
            self.Content["issn"] = self.issn2
        # Only values that *start* with "db/" are site-relative links; testing
        # for the substring anywhere would also mangle absolute URLs.
        if text.startswith("db/"):
            text = self._DBLP_BASE + text
        return text

    def _emit(self, line):
        """Print one output line."""
        if not isinstance(line, str):
            # Only reachable on Python 2, where parsed text is ``unicode``:
            # printing non-ASCII ``unicode`` to a pipe or file raises
            # UnicodeEncodeError, so encode explicitly.
            line = line.encode("utf-8")
        print(line)
def main(sourceFileName):
    """Parse the XML file at *sourceFileName* with ``ABContentHandler``.

    The handler prints a header line and one tab-separated row per record
    to standard output.
    """
    # Binary mode hands the parser raw bytes so it can honor the encoding
    # declared by the document itself; the ``with`` block closes the file
    # even when parsing raises.
    with open(sourceFileName, "rb") as source:
        xml.sax.parse(source, ABContentHandler())
if __name__ == "__main__":
    import sys

    # The input path may be given as the first command-line argument
    # (e.g. "dblp.xml"); without one, the previous hard-coded sample file
    # is used so existing invocations behave as before.
    main(sys.argv[1] if len(sys.argv) > 1 else "removeSS.xml")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment