tomonari-masada · July 25, 2018 08:09
diff --git a/dblp_parse.py b/dblp_parse.py
 # -*- coding: utf-8 -*-
 from lxml import etree
 import os
 import sys
 from io import TextIOWrapper
 from nltk.tokenize import RegexpTokenizer

 #
 # USAGE:
 #
 # This code outputs the year and the title of each entry. (If you need author names, you may modify the code.)
 #
 # 1. Preprocess dblp.xml and make dblp._no_tags_.xml, which is parsed by this code.
 # $ cat dblp.xml | sed 's/<i>//g' | sed 's/<\/i>//g' | sed 's/<sup>//g' | sed 's/<\/sup>//g' | sed 's/<sub>//g' | sed 's/<\/sub>//g' | sed 's/<tt>//g' | sed 's/<\/tt>//g' > dblp._no_tags_.xml
 #
 # 2. Make tags.txt, which is read in this code.
 # $ cat dblp.xml | awk '{if(substr($1,1,2)=="</"){split($1,a,">");print substr(a[1],3,length(a[1]))}}' | uniq | sort | uniq > tags.txt
 #
 # 3. Run this code
 # $ python dblp_parse.py
 #

 # Replace sys.stdout with a text wrapper around its underlying byte buffer so
 # that everything printed by this script is encoded as UTF-8.
 sys.stdout = TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

 # Tokenizer built from the pattern r'\w+'; fast_iter uses it to split each
 # title into the words that are printed.
 tokenizer = RegexpTokenizer(r'\w+')

 # tags.txt (produced by USAGE step 2) is read as one tag name per line.
 # fast_iter checks every element's tag against this list.
 with open('tags.txt') as f:
     collaborations = f.read().splitlines()

 def fast_iter(context):
     """Print one "<year> <title words...>" line for each record in *context*.

     *context* is an iterable of ``(event, element)`` pairs, such as the
     object returned by ``etree.iterparse``.  The text of the most recent
     ``title`` and ``year`` elements is remembered.  When an element whose
     tag is listed in the module-level ``collaborations`` is reached, the
     remembered values are written to stdout as a single line: the year as
     an integer, followed by the title split into words by the module-level
     ``tokenizer``, all separated by single spaces.

     Records for which no title text or no year text was seen are skipped
     silently.  Records whose year is not an integer are skipped with a
     message on stderr instead of aborting the run.

     Returns None.
     """
     # Build the lookup once; a set answers membership in constant time,
     # whereas the list would be scanned again for every element.
     record_tags = frozenset(collaborations)
     title = ''
     year = ''

     for _event, elem in context:
         tag = elem.tag
         if tag == 'title':
             if elem.text:
                 title = elem.text
         elif tag == 'year':
             if elem.text:
                 year = elem.text
         elif tag in record_tags:
             if title and year:
                 try:
                     year_number = int(year)
                 except ValueError:
                     # One malformed year should not stop the whole run.
                     print('skipping record with non-numeric year: {!a}'.format(year),
                           file=sys.stderr)
                 else:
                     fields = ['{:d}'.format(year_number)]
                     fields.extend(tokenizer.tokenize(title))
                     print(' '.join(fields), flush=True)
             # Always start the next record from a clean state.  Otherwise a
             # record that lacks a title or a year would leak its other field
             # into a later record.
             title = ''
             year = ''

         # Drop the element's content, then delete the parent's leading
         # children until this element has no previous sibling (apparently
         # to keep memory use flat while streaming a very large file).
         elem.clear()
         while elem.getprevious() is not None:
             del elem.getparent()[0]

 if __name__ == "__main__":
     # An optional first command-line argument overrides the input path.
     # Without it the script reads 'dblp._no_tags_.xml', exactly as before.
     xml_path = sys.argv[1] if len(sys.argv) > 1 else 'dblp._no_tags_.xml'
     context = etree.iterparse(xml_path, load_dtd=True, html=True)
     fast_iter(context)
	# -*- coding: utf-8 -*-
	from lxml import etree
	import os
	import sys
	from io import TextIOWrapper
	from nltk.tokenize import RegexpTokenizer

	#
	# USAGE:
	#
	# This code outputs the year and the title of each entry. (If you need author names, you may modify the code.)
	#
	# 1. Preprocess dblp.xml and make dblp._no_tags_.xml, which is parsed by this code.
	# $ cat dblp.xml | sed 's/<i>//g' | sed 's/<\/i>//g' | sed 's/<sup>//g' | sed 's/<\/sup>//g' | sed 's/<sub>//g' | sed 's/<\/sub>//g' | sed 's/<tt>//g' | sed 's/<\/tt>//g' > dblp._no_tags_.xml
	#
	# 2. Make tags.txt, which is read in this code.
	# $ cat dblp.xml | awk '{if(substr($1,1,2)=="</"){split($1,a,">");print substr(a[1],3,length(a[1]))}}' | uniq | sort | uniq > tags.txt
	#
	# 3. Run this code
	# $ python dblp_parse.py
	#

	# Replace sys.stdout with a text wrapper around its underlying byte buffer so
	# that everything printed by this script is encoded as UTF-8.
	sys.stdout = TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

	# Tokenizer built from the pattern r'\w+'; fast_iter uses it to split each
	# title into the words that are printed.
	tokenizer = RegexpTokenizer(r'\w+')

	# tags.txt (produced by USAGE step 2) is read as one tag name per line.
	# fast_iter checks every element's tag against this list.
	with open('tags.txt') as f:
	    collaborations = f.read().splitlines()

	def fast_iter(context):
	    """Print one "<year> <title words...>" line for each record in *context*.

	    *context* is an iterable of ``(event, element)`` pairs, such as the
	    object returned by ``etree.iterparse``.  The text of the most recent
	    ``title`` and ``year`` elements is remembered.  When an element whose
	    tag is listed in the module-level ``collaborations`` is reached, the
	    remembered values are written to stdout as a single line: the year as
	    an integer, followed by the title split into words by the module-level
	    ``tokenizer``, all separated by single spaces.

	    Records for which no title text or no year text was seen are skipped
	    silently.  Records whose year is not an integer are skipped with a
	    message on stderr instead of aborting the run.

	    Returns None.
	    """
	    # Build the lookup once; a set answers membership in constant time,
	    # whereas the list would be scanned again for every element.
	    record_tags = frozenset(collaborations)
	    title = ''
	    year = ''

	    for _event, elem in context:
	        tag = elem.tag
	        if tag == 'title':
	            if elem.text:
	                title = elem.text
	        elif tag == 'year':
	            if elem.text:
	                year = elem.text
	        elif tag in record_tags:
	            if title and year:
	                try:
	                    year_number = int(year)
	                except ValueError:
	                    # One malformed year should not stop the whole run.
	                    print('skipping record with non-numeric year: {!a}'.format(year),
	                          file=sys.stderr)
	                else:
	                    fields = ['{:d}'.format(year_number)]
	                    fields.extend(tokenizer.tokenize(title))
	                    print(' '.join(fields), flush=True)
	            # Always start the next record from a clean state.  Otherwise a
	            # record that lacks a title or a year would leak its other field
	            # into a later record.
	            title = ''
	            year = ''

	        # Drop the element's content, then delete the parent's leading
	        # children until this element has no previous sibling (apparently
	        # to keep memory use flat while streaming a very large file).
	        elem.clear()
	        while elem.getprevious() is not None:
	            del elem.getparent()[0]

	if __name__ == "__main__":
	    # An optional first command-line argument overrides the input path.
	    # Without it the script reads 'dblp._no_tags_.xml', exactly as before.
	    xml_path = sys.argv[1] if len(sys.argv) > 1 else 'dblp._no_tags_.xml'
	    context = etree.iterparse(xml_path, load_dtd=True, html=True)
	    fast_iter(context)