bxshi · January 27, 2015 17:04
diff --git a/dblp_parser.py b/dblp_parser.py
 #!/usr/bin/python3
 """Split a DBLP-style XML file into three CSV files.

 Command line arguments, in order: the input XML file, then the output CSV
 files for records, for author/editor links and for cite/crossref links.

 The input is scanned line by line; each record (see ``element_type``) is
 collected as text, parsed on its own with lxml and written out immediately,
 so the whole document is never parsed in one go.  A record's closing tag is
 only recognised at the start of a line, and an opening tag only at the start
 of a line or directly after a closing tag on the same line.
 """

 import csv
 import os
 import sys
 from lxml import etree

 if len(sys.argv) != 5:
     print("Usage simple_parser.py dblp.xml article.csv author.csv citation.csv")
     sys.exit(1)

 # Bib instances
 element_type = ["article", "inproceedings", "proceedings",
                 "book", "incollection", "phdthesis", "mastersthesis", "www"]

 # Article attributes
 inst_attr = ["key", "mdate", "publtype", "reviewid", "rating", "type", "title",
              "booktitle", "pages", "year", "address", "journal", "volume",
              "number", "month", "url", "ee", "cdrom", "publisher", "note",
              "isbn", "series", "school", "chapter"]

 # Author attributes
 auth_attr = ["person", "paper", "rel_type"]  # rel_type indicates whether an author or editor type

 # Citation attributes
 cite_attr = ["paper", "cites", "cite_type"]  # cite_type indicates whether a regular paper cite or crossref

 # Validating parser; the DOCTYPE added around every record points at "dblp.dtd".
 xml_parser = etree.XMLParser(load_dtd=True, dtd_validation=True, attribute_defaults=True)


 def _instance_type_at(line):
     """Return the record tag that opens at the very start of *line*, else ""."""
     if len(line) > 2 and line[0] == "<" and line[1] != "/":
         end = 1
         # Bounded scan: a line without '>' or ' ' must not run off the end.
         while end < len(line) and line[end] not in "> ":
             end += 1
         tag = line[1:end]
         if tag in element_type:
             return tag
     return ""


 def _write_instance(raw_xml, instance_type, inst_out, auth_out, cite_out):
     """Parse one record's XML text and write its rows to the three CSV writers.

     raw_xml       -- text of the record, from its opening to its closing tag
     instance_type -- tag name of the record (one of ``element_type``)
     inst_out, auth_out, cite_out -- csv.DictWriter objects for the record,
                                     author/editor and cite/crossref files
     """
     root = etree.XML("<!DOCTYPE dblp SYSTEM \"dblp.dtd\">\n<dblp>\n" + raw_xml + "\n</dblp>", parser=xml_parser)
     record = root.find(instance_type)
     key = record.get("key")

     # Work on a plain dict copy rather than the element's live attribute
     # proxy, so that bytes/None values never get written back into the tree
     # and DictWriter receives an ordinary mapping.
     row = dict(record.attrib)
     row["type"] = instance_type
     # Fill up missing attributes with default values
     for attr in inst_attr:
         row.setdefault(attr, "")

     authors = []
     cites = []
     for child in record:
         tag = child.tag
         if tag in inst_attr:
             if tag == "title":
                 # Keep inline markup of the title; ask for str (not bytes)
                 # and leave out the whitespace that follows the element.
                 row[tag] = etree.tostring(child, encoding="unicode", with_tail=False)
             else:
                 row[tag] = child.text
         elif tag in ("author", "editor"):
             authors.append({"person": child.text,
                             "paper": key,
                             "rel_type": tag})
         elif tag in ("cite", "crossref"):
             cites.append({"paper": key,
                           "cites": child.text,
                           "cite_type": tag})

     inst_out.writerow(row)
     auth_out.writerows(authors)
     cite_out.writerows(cites)


 # newline="" lets the csv module control line endings itself; the with
 # statement makes sure all three outputs are flushed and closed.
 with open(sys.argv[2], "w", newline="") as inst_file, \
         open(sys.argv[3], "w", newline="") as auth_file, \
         open(sys.argv[4], "w", newline="") as cite_file:

     # CSV dict writers
     inst_writer = csv.DictWriter(inst_file, inst_attr, dialect="unix")
     auth_writer = csv.DictWriter(auth_file, auth_attr, dialect="unix")
     cite_writer = csv.DictWriter(cite_file, cite_attr, dialect="unix")

     # Write headers for each CSV file
     inst_writer.writeheader()
     auth_writer.writeheader()
     cite_writer.writeheader()

     # Open XML file
     with open(sys.argv[1]) as f:

         is_in_instance = False
         tmp_instance_type = ""
         tmp_instance = ""

         cnt = 0

         for line in f:
             if not is_in_instance:
                 # Locate the start of an instance
                 tmp_instance_type = _instance_type_at(line)
                 if tmp_instance_type:
                     is_in_instance = True
                     tmp_instance = line
                 continue

             closing_tag = "</" + tmp_instance_type + ">"
             if not line.startswith(closing_tag):
                 # Still inside the record: add line to instance buffer
                 tmp_instance += line
                 continue

             # End of the record reached: parse it and write it out.
             tmp_instance += closing_tag
             _write_instance(tmp_instance, tmp_instance_type,
                             inst_writer, auth_writer, cite_writer)
             cnt += 1
             if cnt % 10000 == 0:
                 print(cnt, "done")

             # Whatever follows the closing tag on this line may already open
             # the next record, so examine the rest of the *line*.
             buf = line[len(closing_tag):]
             tmp_instance_type = _instance_type_at(buf)
             is_in_instance = bool(tmp_instance_type)
             tmp_instance = buf if is_in_instance else ""
	#!/usr/bin/python3
	"""Split a DBLP-style XML file into three CSV files.

	Command line arguments, in order: the input XML file, then the output CSV
	files for records, for author/editor links and for cite/crossref links.

	The input is scanned line by line; each record (see ``element_type``) is
	collected as text, parsed on its own with lxml and written out immediately,
	so the whole document is never parsed in one go.  A record's closing tag is
	only recognised at the start of a line, and an opening tag only at the start
	of a line or directly after a closing tag on the same line.
	"""

	import csv
	import os
	import sys
	from lxml import etree

	if len(sys.argv) != 5:
	    print("Usage simple_parser.py dblp.xml article.csv author.csv citation.csv")
	    sys.exit(1)

	# Bib instances
	element_type = ["article", "inproceedings", "proceedings",
	                "book", "incollection", "phdthesis", "mastersthesis", "www"]

	# Article attributes
	inst_attr = ["key", "mdate", "publtype", "reviewid", "rating", "type", "title",
	             "booktitle", "pages", "year", "address", "journal", "volume",
	             "number", "month", "url", "ee", "cdrom", "publisher", "note",
	             "isbn", "series", "school", "chapter"]

	# Author attributes
	auth_attr = ["person", "paper", "rel_type"]  # rel_type indicates whether an author or editor type

	# Citation attributes
	cite_attr = ["paper", "cites", "cite_type"]  # cite_type indicates whether a regular paper cite or crossref

	# Validating parser; the DOCTYPE added around every record points at "dblp.dtd".
	xml_parser = etree.XMLParser(load_dtd=True, dtd_validation=True, attribute_defaults=True)


	def _instance_type_at(line):
	    """Return the record tag that opens at the very start of *line*, else ""."""
	    if len(line) > 2 and line[0] == "<" and line[1] != "/":
	        end = 1
	        # Bounded scan: a line without '>' or ' ' must not run off the end.
	        while end < len(line) and line[end] not in "> ":
	            end += 1
	        tag = line[1:end]
	        if tag in element_type:
	            return tag
	    return ""


	def _write_instance(raw_xml, instance_type, inst_out, auth_out, cite_out):
	    """Parse one record's XML text and write its rows to the three CSV writers.

	    raw_xml       -- text of the record, from its opening to its closing tag
	    instance_type -- tag name of the record (one of ``element_type``)
	    inst_out, auth_out, cite_out -- csv.DictWriter objects for the record,
	                                    author/editor and cite/crossref files
	    """
	    root = etree.XML("<!DOCTYPE dblp SYSTEM \"dblp.dtd\">\n<dblp>\n" + raw_xml + "\n</dblp>", parser=xml_parser)
	    record = root.find(instance_type)
	    key = record.get("key")

	    # Work on a plain dict copy rather than the element's live attribute
	    # proxy, so that bytes/None values never get written back into the tree
	    # and DictWriter receives an ordinary mapping.
	    row = dict(record.attrib)
	    row["type"] = instance_type
	    # Fill up missing attributes with default values
	    for attr in inst_attr:
	        row.setdefault(attr, "")

	    authors = []
	    cites = []
	    for child in record:
	        tag = child.tag
	        if tag in inst_attr:
	            if tag == "title":
	                # Keep inline markup of the title; ask for str (not bytes)
	                # and leave out the whitespace that follows the element.
	                row[tag] = etree.tostring(child, encoding="unicode", with_tail=False)
	            else:
	                row[tag] = child.text
	        elif tag in ("author", "editor"):
	            authors.append({"person": child.text,
	                            "paper": key,
	                            "rel_type": tag})
	        elif tag in ("cite", "crossref"):
	            cites.append({"paper": key,
	                          "cites": child.text,
	                          "cite_type": tag})

	    inst_out.writerow(row)
	    auth_out.writerows(authors)
	    cite_out.writerows(cites)


	# newline="" lets the csv module control line endings itself; the with
	# statement makes sure all three outputs are flushed and closed.
	with open(sys.argv[2], "w", newline="") as inst_file, \
	        open(sys.argv[3], "w", newline="") as auth_file, \
	        open(sys.argv[4], "w", newline="") as cite_file:

	    # CSV dict writers
	    inst_writer = csv.DictWriter(inst_file, inst_attr, dialect="unix")
	    auth_writer = csv.DictWriter(auth_file, auth_attr, dialect="unix")
	    cite_writer = csv.DictWriter(cite_file, cite_attr, dialect="unix")

	    # Write headers for each CSV file
	    inst_writer.writeheader()
	    auth_writer.writeheader()
	    cite_writer.writeheader()

	    # Open XML file
	    with open(sys.argv[1]) as f:

	        is_in_instance = False
	        tmp_instance_type = ""
	        tmp_instance = ""

	        cnt = 0

	        for line in f:
	            if not is_in_instance:
	                # Locate the start of an instance
	                tmp_instance_type = _instance_type_at(line)
	                if tmp_instance_type:
	                    is_in_instance = True
	                    tmp_instance = line
	                continue

	            closing_tag = "</" + tmp_instance_type + ">"
	            if not line.startswith(closing_tag):
	                # Still inside the record: add line to instance buffer
	                tmp_instance += line
	                continue

	            # End of the record reached: parse it and write it out.
	            tmp_instance += closing_tag
	            _write_instance(tmp_instance, tmp_instance_type,
	                            inst_writer, auth_writer, cite_writer)
	            cnt += 1
	            if cnt % 10000 == 0:
	                print(cnt, "done")

	            # Whatever follows the closing tag on this line may already open
	            # the next record, so examine the rest of the *line*.
	            buf = line[len(closing_tag):]
	            tmp_instance_type = _instance_type_at(buf)
	            is_in_instance = bool(tmp_instance_type)
	            tmp_instance = buf if is_in_instance else ""
No results found