Skip to content

Instantly share code, notes, and snippets.

@bxshi
Created January 27, 2015 17:04
Show Gist options
  • Select an option

  • Save bxshi/a9e70a8e2d5e05ffa6fc to your computer and use it in GitHub Desktop.

Select an option

Save bxshi/a9e70a8e2d5e05ffa6fc to your computer and use it in GitHub Desktop.
DBLP2CSV Parser
#!/usr/bin/python3
import csv
import os
import sys
from lxml import etree
# The script needs exactly four arguments after its own name: the DBLP XML
# input followed by the article, author and citation CSV output paths.
# Compare by value: `is not 5` only worked through CPython's small-int cache.
if len(sys.argv) != 5:
    print("Usage simple_parser.py dblp.xml article.csv author.csv citation.csv")
    # sys.exit is always available; the bare exit() helper comes from `site`.
    sys.exit(1)
# Names of the bibliography record elements that are exported
element_type = [
    "article",
    "inproceedings",
    "proceedings",
    "book",
    "incollection",
    "phdthesis",
    "mastersthesis",
    "www",
]

# Columns of the article CSV (one row per bibliography record)
inst_attr = [
    "key",
    "mdate",
    "publtype",
    "reviewid",
    "rating",
    "type",
    "title",
    "booktitle",
    "pages",
    "year",
    "address",
    "journal",
    "volume",
    "number",
    "month",
    "url",
    "ee",
    "cdrom",
    "publisher",
    "note",
    "isbn",
    "series",
    "school",
    "chapter",
]

# Columns of the author CSV; rel_type records whether the person is listed
# as an author or as an editor
auth_attr = [
    "person",
    "paper",
    "rel_type",
]

# Columns of the citation CSV; cite_type records whether the link is a
# regular paper cite or a crossref
cite_attr = [
    "paper",
    "cites",
    "cite_type",
]
def opening_tag_name(line):
    """Return the element name if *line* begins with an opening tag.

    A line qualifies when it is longer than two characters, starts with
    ``<`` and is not a closing tag (``</``).  The name runs up to the first
    ``>`` or whitespace character, or to the end of the line when neither
    is present.  Returns ``None`` for any other line.
    """
    if len(line) <= 2 or line[0] != "<" or line[1] == "/":
        return None
    end = 1
    # Bounded scan: a line without '>' or whitespace must not run off the end.
    while end < len(line) and line[end] != ">" and not line[end].isspace():
        end += 1
    return line[1:end]


def remainder_after_closing_tag(line, tag):
    """Return the text after ``</tag>`` if *line* starts with it, else None."""
    closing = "</" + tag + ">"
    if line.startswith(closing):
        return line[len(closing):]
    return None


def iter_instances(lines, types):
    """Yield ``(type, xml_text)`` for every record found in *lines*.

    A record starts on a line whose opening tag name is in *types* and ends
    on the first following line that starts with the matching closing tag.
    Whatever follows the closing tag on that same line is examined as a
    possible start of the next record, so ``</article><article ...>`` on a
    single line does not lose the second record.  A record still open when
    *lines* is exhausted is not yielded.
    """
    current = None
    chunks = []
    for line in lines:
        if current is None:
            tag = opening_tag_name(line)
            if tag in types:
                current, chunks = tag, [line]
            continue

        rest = remainder_after_closing_tag(line, current)
        if rest is None:
            chunks.append(line)
            continue

        # Keep only the closing tag itself; the rest of the line belongs to
        # whatever comes after this record.
        chunks.append("</" + current + ">")
        yield current, "".join(chunks)

        tag = opening_tag_name(rest)
        if tag in types:
            current, chunks = tag, [rest]
        else:
            current, chunks = None, []


def write_instance(xml_text, instance_type, xml_parser,
                   inst_writer, auth_writer, cite_writer):
    """Parse one record and append its rows to the three CSV writers.

    The record is wrapped in a ``<dblp>`` root with a DOCTYPE pointing at
    ``dblp.dtd`` so that *xml_parser* can load the DTD.
    NOTE(review): the DTD is referenced by a relative system identifier;
    presumably it has to be resolvable from the working directory - confirm.
    """
    root = etree.XML(
        "<!DOCTYPE dblp SYSTEM \"dblp.dtd\">\n<dblp>\n" + xml_text + "\n</dblp>",
        parser=xml_parser)
    elem = root.find(instance_type)
    key = elem.get("key")

    # Copy the XML attributes into a plain dict instead of mutating (and
    # later handing to csv) lxml's attribute proxy.
    row = dict(elem.attrib)
    row["type"] = instance_type
    # Fill up missing columns with an empty default
    for attr in inst_attr:
        row.setdefault(attr, "")

    authors = []
    cites = []
    for sub in elem:
        tag = sub.tag
        if tag in inst_attr:
            if tag == "title":
                # Titles are kept as serialised XML (markup included).  Ask
                # for text rather than bytes and leave out the whitespace
                # that trails the element.
                row[tag] = etree.tostring(sub, encoding="unicode",
                                          with_tail=False)
            else:
                row[tag] = sub.text
        elif tag == "author" or tag == "editor":
            authors.append({"person": sub.text,
                            "paper": key,
                            "rel_type": tag})
        elif tag == "cite" or tag == "crossref":
            cites.append({"paper": key,
                          "cites": sub.text,
                          "cite_type": tag})

    inst_writer.writerow(row)
    auth_writer.writerows(authors)
    cite_writer.writerows(cites)


def dblp_to_csv(xml_path, article_path, author_path, citation_path):
    """Convert the DBLP XML file at *xml_path* into three CSV files.

    Writes one row per record to *article_path*, one row per author/editor
    to *author_path* and one row per cite/crossref to *citation_path*, each
    preceded by a header row.  Prints a progress line every 10000 records
    and returns the number of records written.
    """
    xml_parser = etree.XMLParser(load_dtd=True, dtd_validation=True,
                                 attribute_defaults=True)
    # newline="" is what the csv module asks for on files it writes to; the
    # with-block guarantees the outputs are flushed and closed.
    with open(article_path, "w", newline="") as inst_file, \
            open(author_path, "w", newline="") as auth_file, \
            open(citation_path, "w", newline="") as cite_file:
        # CSV dict writers
        inst_writer = csv.DictWriter(inst_file, inst_attr, dialect="unix")
        auth_writer = csv.DictWriter(auth_file, auth_attr, dialect="unix")
        cite_writer = csv.DictWriter(cite_file, cite_attr, dialect="unix")
        # Write headers for each CSV file
        inst_writer.writeheader()
        auth_writer.writeheader()
        cite_writer.writeheader()

        cnt = 0
        # NOTE(review): the input is read with the platform default
        # encoding, as before - confirm this matches the XML dump.
        with open(xml_path) as f:
            for instance_type, xml_text in iter_instances(f, element_type):
                write_instance(xml_text, instance_type, xml_parser,
                               inst_writer, auth_writer, cite_writer)
                cnt += 1
                if cnt % 10000 == 0:
                    print(cnt, "done")
    return cnt


if __name__ == "__main__":
    dblp_to_csv(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment