Created
July 25, 2018 08:09
-
-
Save tomonari-masada/1568b8affc124490df85c35c5f461574 to your computer and use it in GitHub Desktop.
A Python parser for dblp.xml
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
from lxml import etree | |
import os | |
import sys | |
from io import TextIOWrapper | |
from nltk.tokenize import RegexpTokenizer | |
# | |
# USAGE: | |
# | |
# This code outputs the year and the title of each entry. (If you need author names, you may modify the code.) | |
# | |
# 1. Preprocess dblp.xml and make dblp._no_tags_.xml, which is parsed by this code. | |
# $ cat dblp.xml | sed 's/<i>//g' | sed 's/<\/i>//g' | sed 's/<sup>//g' | sed 's/<\/sup>//g' | sed 's/<sub>//g' | sed 's/<\/sub>//g' | sed 's/<tt>//g' | sed 's/<\/tt>//g' > dblp._no_tags_.xml
# | |
# 2. Make tags.txt, which is read in this code. | |
# $ cat dblp.xml | awk '{if(substr($1,1,2)=="</"){split($1,a,">");print substr(a[1],3,length(a[1]))}}' | uniq | sort | uniq > tags.txt | |
# | |
# 3. Run this code | |
# $ python dblp_parse.py | |
# | |
# Re-wrap stdout so that titles containing non-ASCII characters are always
# written as UTF-8, regardless of the locale's default encoding.
sys.stdout = TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Titles are split into runs of word characters; everything else is dropped.
tokenizer = RegexpTokenizer(r'\w+')

# tags.txt holds one tag name per line (see step 2 of USAGE). fast_iter treats
# an element whose tag is listed here as the end of an entry.
# The encoding is given explicitly so the read does not depend on the
# platform's default text encoding.
with open('tags.txt', encoding='utf-8') as f:
    collaborations = f.read().splitlines()
def fast_iter(context):
    """Print one ``<year> <title words...>`` line per entry in *context*.

    *context* is an iterable of ``(event, elem)`` pairs, such as the object
    returned by ``etree.iterparse``.  The most recent non-empty ``title`` and
    ``year`` texts are remembered.  When an element whose tag is listed in
    the module-level ``collaborations`` is reached, the pair is printed as
    the integer year followed by the title tokens produced by the
    module-level ``tokenizer``, separated by single spaces.

    Entries lacking a title or a year are skipped.  Entries whose year is
    not an integer are skipped as well, with a warning on stderr.  Every
    processed element is cleared and its preceding siblings are removed from
    the tree, so the parsed document does not accumulate in memory.
    """
    # Build the lookup set once: membership is tested for every element.
    entry_tags = frozenset(collaborations)
    title = ''
    year = ''
    # Only the year and the title of each entry are tracked.
    for event, elem in context:
        tag = elem.tag
        if tag == 'title':
            if elem.text:
                title = elem.text
        elif tag == 'year':
            if elem.text:
                year = elem.text
        if tag in entry_tags:
            if title and year:
                try:
                    year_number = int(year)
                except ValueError:
                    # One malformed year must not abort the whole run.
                    print('skipping entry with non-integer year {!r}'.format(year),
                          file=sys.stderr)
                else:
                    fields = ['{:d}'.format(year_number)]
                    fields.extend(tokenizer.tokenize(title))
                    print(' '.join(fields), flush=True)
            # Reset at every entry boundary, printed or not, so a field from
            # an incomplete entry cannot be paired with the next entry.
            title = ''
            year = ''
        # Release the element and the siblings that precede it.
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
if __name__ == "__main__":
    # An optional first command-line argument names the XML file to parse;
    # without it, the preprocessed file from step 1 of USAGE is used.
    xml_path = sys.argv[1] if len(sys.argv) > 1 else 'dblp._no_tags_.xml'
    context = etree.iterparse(xml_path, load_dtd=True, html=True)
    fast_iter(context)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment