Created
January 27, 2015 17:04
-
-
Save bxshi/a9e70a8e2d5e05ffa6fc to your computer and use it in GitHub Desktop.
DBLP2CSV Parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/python3 | |
| import csv | |
| import os | |
| import sys | |
| from lxml import etree | |
# Bib instances: top-level DBLP element types that become one CSV row each.
element_type = ["article", "inproceedings", "proceedings",
                "book", "incollection", "phdthesis", "mastersthesis", "www"]

# Article attributes (columns of the instance CSV).
inst_attr = ["key", "mdate", "publtype", "reviewid", "rating", "type", "title",
             "booktitle", "pages", "year", "address", "journal", "volume",
             "number", "month", "url", "ee", "cdrom", "publisher", "note",
             "isbn", "series", "school", "chapter"]

# Author attributes; rel_type indicates whether an author or editor type.
auth_attr = ["person", "paper", "rel_type"]

# Citation attributes; cite_type indicates whether a regular paper cite or crossref.
cite_attr = ["paper", "cites", "cite_type"]


def leading_tag(line):
    """Return the name of the opening tag at the very start of *line*.

    Returns None when the line is too short (two characters or fewer), does
    not begin with ``<``, or begins with a closing tag (``</``).  The name
    ends at the first ``>`` or whitespace character; the scan is bounded by
    the line length so a line without a terminator cannot raise IndexError.
    """
    if len(line) <= 2 or line[0] != "<" or line[1] == "/":
        return None
    off = 1
    while off < len(line) and line[off] != ">" and not line[off].isspace():
        off += 1
    return line[1:off]


def instance_start(line):
    """Return the bib instance type opened at the start of *line*, or None."""
    tag = leading_tag(line)
    return tag if tag in element_type else None


def split_closing_line(line, instance_type):
    """Split *line* at the closing tag of *instance_type*.

    Returns ``(closing_tag, rest_of_line)`` when the line starts with
    ``</instance_type>``, otherwise None.  The remainder is taken from the
    current line, so an opening tag that shares the line with the closing
    tag (``</article><article ...>``) is preserved for the caller.
    """
    closing = "</" + instance_type + ">"
    if not line.startswith(closing):
        return None
    return closing, line[len(closing):]


def extract_record(xml_inst, instance_type):
    """Turn one parsed bib element into CSV rows.

    Returns ``(row, authors, cites)``: *row* is a plain dict holding every
    column of ``inst_attr``, *authors* is a list of ``auth_attr`` dicts and
    *cites* a list of ``cite_attr`` dicts.  When a sub-element tag occurs
    more than once, the last occurrence wins in *row*.
    """
    key = xml_inst.get("key")

    # Copy the attributes into a real dict: writing into the element's own
    # attribute mapping would mutate the element, reject None values, and
    # does not offer the dict-view API csv.DictWriter relies on.
    row = dict(xml_inst.attrib)
    row["type"] = instance_type
    # Fill up missing attributes with default values.
    for attr in inst_attr:
        row.setdefault(attr, "")

    authors = []
    cites = []
    for subelem in xml_inst:
        tag = subelem.tag
        if tag in inst_attr:
            if tag == "title":
                # Titles may contain inline markup, so keep the serialized
                # element; encoding="unicode" yields str instead of bytes,
                # which would otherwise be written as b'...'.
                row[tag] = etree.tostring(subelem, encoding="unicode")
            else:
                row[tag] = subelem.text
        elif tag in ("author", "editor"):
            authors.append({"person": subelem.text,
                            "paper": key,
                            "rel_type": tag})
        elif tag in ("cite", "crossref"):
            cites.append({"paper": key,
                          "cites": subelem.text,
                          "cite_type": tag})
    return row, authors, cites


def main():
    """Convert a DBLP XML dump into article, author and citation CSV files.

    Command line: ``dblp.xml article.csv author.csv citation.csv``.  The XML
    file is scanned line by line; each bib instance is collected into a
    buffer, parsed on its own against ``dblp.dtd`` and written out, so the
    whole dump is never held in memory.
    """
    if len(sys.argv) != 5:
        print("Usage simple_parser.py dblp.xml article.csv author.csv citation.csv")
        sys.exit(1)

    # newline="" lets the csv module control line endings itself.
    with open(sys.argv[2], "w", newline="") as inst_file, \
            open(sys.argv[3], "w", newline="") as auth_file, \
            open(sys.argv[4], "w", newline="") as cite_file:
        # CSV dict writers
        inst_writer = csv.DictWriter(inst_file, inst_attr, dialect="unix")
        auth_writer = csv.DictWriter(auth_file, auth_attr, dialect="unix")
        cite_writer = csv.DictWriter(cite_file, cite_attr, dialect="unix")

        # Write headers for each CSV file
        inst_writer.writeheader()
        auth_writer.writeheader()
        cite_writer.writeheader()

        xml_parser = etree.XMLParser(load_dtd=True, dtd_validation=True,
                                     attribute_defaults=True)

        # Open XML file
        with open(sys.argv[1]) as f:
            instance_type = None  # type of the instance being collected
            instance_lines = []   # raw text of that instance
            cnt = 0
            for line in f:
                if instance_type is None:
                    # Locate the start of an instance
                    instance_type = instance_start(line)
                    if instance_type is not None:
                        instance_lines = [line]
                    continue

                parts = split_closing_line(line, instance_type)
                if parts is None:
                    # Still inside the instance: add line to its buffer
                    instance_lines.append(line)
                    continue

                closing, rest = parts
                instance_lines.append(closing)
                root = etree.XML("<!DOCTYPE dblp SYSTEM \"dblp.dtd\">\n<dblp>\n"
                                 + "".join(instance_lines) + "\n</dblp>",
                                 parser=xml_parser)
                row, authors, cites = extract_record(root.find(instance_type),
                                                     instance_type)

                # Output
                inst_writer.writerow(row)
                auth_writer.writerows(authors)
                cite_writer.writerows(cites)
                cnt += 1

                # The text after the closing tag may already open the next
                # instance on the same line.
                instance_type = instance_start(rest)
                instance_lines = [rest] if instance_type is not None else []

                if cnt % 10000 == 0:
                    print(cnt, "done")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment