mlincoln · March 29, 2013 03:42
diff --git a/gistfile1.txt b/gistfile1.txt
 # -*- coding: latin-1 -*-
 import xml.sax
 import re
 import unicodedata
 import codecs
 
 class ABContentHandler(xml.sax.ContentHandler):
     """SAX handler that prints one tab-separated row per bibliographic item.

     A header row is printed on construction.  For every element whose tag is
     listed in ``itemtypes``, the text of its child elements listed in
     ``fields`` is collected into ``Content`` and printed, prefixed with the
     item's ``key`` attribute, when the item closes.  A field that never
     occurs is reported as ``-``; repeated fields are joined with ``|``.

     Author values are rewritten as
     ``lastname\\firstname\\firstinit\\middleinit\\suffix`` (single backslash
     separators, ``-`` for a missing part).
     """

     # Three-letter month abbreviations accepted for the "month" field.
     Months = ("jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec")
     # Lower-cased trailing name tokens that are treated as a name suffix.
     _SUFFIXES = ("jr.", "sr.", "ii", "iii", "iv", "v")
     # Prefix for "ee" values that are site-relative (start with "db/").
     _DBLP_ROOT = "http://dblp.uni-trier.de/"

     def __init__(self):
         xml.sax.ContentHandler.__init__(self)
         ## second-level item types we are interested in (these are all 8)
         self.itemtypes = ("article", "inproceedings", "proceedings", "book", "incollection", "phdthesis", "mastersthesis", "www")
         ## third-level elements we are interested in (the details of self.itemtypes)
         self.fields = ("author", "title", "journal", "issn", "year", "month", "volume", "number", "pages", "ee", "publisher")
         ## print the header (single-argument print() behaves the same on Python 2 and 3)
         print("Key\t" + "\t".join(self.fields))
         ## per-field "tag is currently open" flags (0/1) and collected text
         self.isElement = dict.fromkeys(self.fields, 0)
         self.Content = {}
         self.articleKey = ''
         self.issn2 = "-"

     def startElement(self, name, attrs):
         """Start a new item (resetting every field to ``-``) or open a field."""
         if name in self.itemtypes:
             self.articleKey = attrs.get('key', '')
             for field in self.fields:
                 self.Content[field] = '-'
         elif name in self.fields:
             self.isElement[name] = 1
             ## a repeated tag is separated from the previous value by a pipe;
             ## setdefault keeps a field seen outside any item from raising KeyError
             if self.Content.setdefault(name, '-') != '-':
                 self.Content[name] += '|'

     def characters(self, content):
         """Append ``content`` to every field whose tag is currently open.

         A parser may deliver one text node in several chunks, so chunks are
         concatenated as-is; the ``|`` separator is only added in
         ``startElement`` when a tag repeats.
         """
         for field in self.fields:
             if self.isElement[field] == 1:
                 if self.Content[field] == '-':
                     self.Content[field] = content
                 else:
                     self.Content[field] += content

     def endElement(self, name):
         """Post-process a closed field, or print the row when an item closes."""
         if name == "author":
             self._format_last_author()
         elif name == "month":
             ## keep only a recognised three-letter lower-case abbreviation
             month = self.Content[name][0:3].lower()
             if month not in self.Months:
                 month = "-"
             self.Content[name] = month
         elif name == "ee":
             self._expand_last_ee()
         elif name in self.itemtypes:
             ## an item is complete: emit its key and all fields as one tsv row
             row = "\t".join([self.Content[field] for field in self.fields])
             print(self.articleKey + "\t" + row)
         ## always clear the open flag, whatever branch ran above; otherwise
         ## later text would keep being appended to this field
         if name in self.isElement:
             self.isElement[name] = 0

     def _split_author(self, fullname):
         """Return ``(lastname, firstname, firstinit, middleinit, suffix)``.

         ``fullname`` is split on whitespace.  A trailing token found in
         ``_SUFFIXES`` becomes the suffix, the next token from the end is the
         last name, and the remaining tokens form the first name.  Parts that
         cannot be determined are ``-``.
         """
         lastname = firstname = firstinit = middleinit = suffix = "-"
         pieces = fullname.split()
         ## a lone token is always a last name, even if it looks like a suffix
         if len(pieces) > 1 and pieces[-1].lower() in self._SUFFIXES:
             suffix = pieces.pop()
         if pieces:
             lastname = pieces.pop()
         if pieces:
             firstinit = pieces[0][0]
             if len(pieces) > 1:
                 middleinit = pieces[1][0]
                 firstname = ' '.join(pieces)
             elif len(pieces[0].replace(".", "")) != 1:
                 ## a single given name that is only an initial stays "-"
                 firstname = pieces[0]
         return (lastname, firstname, firstinit, middleinit, suffix)

     def _format_last_author(self):
         """Replace the most recently read author with its backslash-joined parts."""
         ## earlier authors (before the last pipe) are already formatted
         head, sep, raw = self.Content["author"].rpartition('|')
         if raw.strip() in ('', '-'):
             return
         self.Content["author"] = head + sep + "\\".join(self._split_author(raw))

     def _expand_last_ee(self):
         """Pull an ISSN out of the latest ``ee`` value and expand site-relative links."""
         head, sep, url = self.Content["ee"].rpartition('|')
         found = re.search(r"issn=(\w{4}-\w{4})", url)
         if found:
             self.issn2 = found.group(1)
             self.Content["issn"] = self.issn2
         ## dblp abbreviates urls that point to its own site, so prepend the
         ## site root to get the full url; only the value just read is touched
         if url.startswith("db/"):
             self.Content["ee"] = head + sep + self._DBLP_ROOT + url
          
 def main(sourceFileName):
     """Parse the XML file at ``sourceFileName`` with an ``ABContentHandler``.

     The handler prints its output to stdout; nothing is returned.
     """
     ## the with-block closes the file even if parsing raises; binary mode
     ## passes raw bytes so the XML parser does the decoding itself
     with open(sourceFileName, "rb") as source:
         xml.sax.parse(source, ABContentHandler())
 
 if __name__ == "__main__":
     import sys
     ## an optional first command-line argument names the input file (the
     ## script previously also carried a disabled call for "dblp.xml");
     ## without one the original default is used
     main(sys.argv[1] if len(sys.argv) > 1 else "removeSS.xml")
	# -*- coding: latin-1 -*-
	import xml.sax
	import re
	import unicodedata
	import codecs

	class ABContentHandler(xml.sax.ContentHandler):
		"""SAX handler that prints one tab-separated row per bibliographic item.

		A header row is printed on construction.  For every element whose tag is
		listed in ``itemtypes``, the text of its child elements listed in
		``fields`` is collected into ``Content`` and printed, prefixed with the
		item's ``key`` attribute, when the item closes.  A field that never
		occurs is reported as ``-``; repeated fields are joined with ``|``.

		Author values are rewritten as
		``lastname\\firstname\\firstinit\\middleinit\\suffix`` (single backslash
		separators, ``-`` for a missing part).
		"""

		# Three-letter month abbreviations accepted for the "month" field.
		Months = ("jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec")
		# Lower-cased trailing name tokens that are treated as a name suffix.
		_SUFFIXES = ("jr.", "sr.", "ii", "iii", "iv", "v")
		# Prefix for "ee" values that are site-relative (start with "db/").
		_DBLP_ROOT = "http://dblp.uni-trier.de/"

		def __init__(self):
			xml.sax.ContentHandler.__init__(self)
			## second-level item types we are interested in (these are all 8)
			self.itemtypes = ("article", "inproceedings", "proceedings", "book", "incollection", "phdthesis", "mastersthesis", "www")
			## third-level elements we are interested in (the details of self.itemtypes)
			self.fields = ("author", "title", "journal", "issn", "year", "month", "volume", "number", "pages", "ee", "publisher")
			## print the header (single-argument print() behaves the same on Python 2 and 3)
			print("Key\t" + "\t".join(self.fields))
			## per-field "tag is currently open" flags (0/1) and collected text
			self.isElement = dict.fromkeys(self.fields, 0)
			self.Content = {}
			self.articleKey = ''
			self.issn2 = "-"

		def startElement(self, name, attrs):
			"""Start a new item (resetting every field to ``-``) or open a field."""
			if name in self.itemtypes:
				self.articleKey = attrs.get('key', '')
				for field in self.fields:
					self.Content[field] = '-'
			elif name in self.fields:
				self.isElement[name] = 1
				## a repeated tag is separated from the previous value by a plain
				## pipe; setdefault keeps a field seen outside any item from
				## raising KeyError
				if self.Content.setdefault(name, '-') != '-':
					self.Content[name] += '|'

		def characters(self, content):
			"""Append ``content`` to every field whose tag is currently open.

			A parser may deliver one text node in several chunks, so chunks are
			concatenated as-is; the ``|`` separator is only added in
			``startElement`` when a tag repeats.
			"""
			for field in self.fields:
				if self.isElement[field] == 1:
					if self.Content[field] == '-':
						self.Content[field] = content
					else:
						self.Content[field] += content

		def endElement(self, name):
			"""Post-process a closed field, or print the row when an item closes."""
			if name == "author":
				self._format_last_author()
			elif name == "month":
				## keep only a recognised three-letter lower-case abbreviation
				month = self.Content[name][0:3].lower()
				if month not in self.Months:
					month = "-"
				self.Content[name] = month
			elif name == "ee":
				self._expand_last_ee()
			elif name in self.itemtypes:
				## an item is complete: emit its key and all fields as one tsv row
				row = "\t".join([self.Content[field] for field in self.fields])
				print(self.articleKey + "\t" + row)
			## always clear the open flag, whatever branch ran above; otherwise
			## later text would keep being appended to this field
			if name in self.isElement:
				self.isElement[name] = 0

		def _split_author(self, fullname):
			"""Return ``(lastname, firstname, firstinit, middleinit, suffix)``.

			``fullname`` is split on whitespace.  A trailing token found in
			``_SUFFIXES`` becomes the suffix, the next token from the end is the
			last name, and the remaining tokens form the first name.  Parts that
			cannot be determined are ``-``.
			"""
			lastname = firstname = firstinit = middleinit = suffix = "-"
			pieces = fullname.split()
			## a lone token is always a last name, even if it looks like a suffix
			if len(pieces) > 1 and pieces[-1].lower() in self._SUFFIXES:
				suffix = pieces.pop()
			if pieces:
				lastname = pieces.pop()
			if pieces:
				firstinit = pieces[0][0]
				if len(pieces) > 1:
					middleinit = pieces[1][0]
					firstname = ' '.join(pieces)
				elif len(pieces[0].replace(".", "")) != 1:
					## a single given name that is only an initial stays "-"
					firstname = pieces[0]
			return (lastname, firstname, firstinit, middleinit, suffix)

		def _format_last_author(self):
			"""Replace the most recently read author with its backslash-joined parts."""
			## earlier authors (before the last pipe) are already formatted
			head, sep, raw = self.Content["author"].rpartition('|')
			if raw.strip() in ('', '-'):
				return
			self.Content["author"] = head + sep + "\\".join(self._split_author(raw))

		def _expand_last_ee(self):
			"""Pull an ISSN out of the latest ``ee`` value and expand site-relative links."""
			head, sep, url = self.Content["ee"].rpartition('|')
			found = re.search(r"issn=(\w{4}-\w{4})", url)
			if found:
				self.issn2 = found.group(1)
				self.Content["issn"] = self.issn2
			## dblp abbreviates urls that point to its own site, so prepend the
			## site root to get the full url; only the value just read is touched
			if url.startswith("db/"):
				self.Content["ee"] = head + sep + self._DBLP_ROOT + url

	def main(sourceFileName):
		"""Parse the XML file at ``sourceFileName`` with an ``ABContentHandler``.

		The handler prints its output to stdout; nothing is returned.
		"""
		## the with-block closes the file even if parsing raises; binary mode
		## passes raw bytes so the XML parser does the decoding itself
		with open(sourceFileName, "rb") as source:
			xml.sax.parse(source, ABContentHandler())

	if __name__ == "__main__":
		import sys
		## an optional first command-line argument names the input file (the
		## script previously also carried a disabled call for "dblp.xml");
		## without one the original default is used
		main(sys.argv[1] if len(sys.argv) > 1 else "removeSS.xml")
No results found